diff options
author | Gil Pitney <gil.pitney@linaro.org> | 2014-10-28 18:00:42 -0700 |
---|---|---|
committer | Gil Pitney <gil.pitney@linaro.org> | 2014-10-28 18:00:42 -0700 |
commit | 61b2c94d9e64758e55730be6a3fc9006c171db85 (patch) | |
tree | f564f09ebf93ba293dfa225bd374df6f1f37aa01 /src |
Initial Commit: Based on TI OpenCL v0.8, originally based on clover. (tag: shamrock_v0.8)
This is a continuation of the clover OpenCL project:
http://people.freedesktop.org/~steckdenis/clover
based on the contributions from Texas Instruments for Keystone II DSP device:
git.ti.com/opencl
and adding contributions from Linaro for ARM CPU-only support.
See README.txt for more info, and build instructions.
Signed-off-by: Gil Pitney <gil.pitney@linaro.org>
Diffstat (limited to 'src')
213 files changed, 86981 insertions, 0 deletions
diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000..ec5d309 --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,3 @@
*.o
CMakeFiles/
cmake_install.cmake
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..7b60902 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,241 @@

# Build rules for the OpenCL shared library.
#
# The build variant is selected by SHANNON_BUILD, HAWKING_BUILD,
# HAWKING_CROSS_COMPILE and SHAMROCK_BUILD. None of them is defined in this
# file; they are expected to come from the enclosing project or the command
# line. Include paths and compiler flags are deliberately kept at directory
# scope because the sub-directories added further down inherit them.

# Extra include paths that only some build variants need.
if (SHANNON_BUILD)
    set(TARGET_INCLUDES
        ${PROJECT_SOURCE_DIR}/init
        ${SDK}/sdk
        ${SDK}/sdk/config
        ${SDK}/sdk/pciedrv
        ${SDK}/sdk/cmem
        ${SDK}/sdk/bufmgr
        ${SDK}/sdk/mailBox
        ${SDK}/sdk/dnldmgr)
# Cross-compiling needs additional paths to find target OS headers
# and non-system headers found on the host (BOOST,GL)
elseif (HAWKING_CROSS_COMPILE)
    set(TARGET_INCLUDES
        ${CMAKE_FIND_ROOT_PATH}
        ${HOST_USR_INCLUDE_PATH})
endif()


include_directories(
    ${PROJECT_SOURCE_DIR}/include
    ${PROJECT_SOURCE_DIR}/src
    ${LLVM_INCLUDE_DIR}
    ${CLANG_INCLUDE_DIRS}
    ${CMAKE_CURRENT_BINARY_DIR}
    ${PROJECT_SOURCE_DIR}/src/core/dsp/ocl_load/DLOAD_API
    ${PROJECT_SOURCE_DIR}/src/core/dsp/ocl_load/DLOAD
    ${PROJECT_SOURCE_DIR}/src/llvmopencl
    ${TARGET_INCLUDES}
    )

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fpermissive")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_FILE_OFFSET_BITS=64")

# bfd.h has a check to ensure that config.h is included
# We don't require config.h (autotools) so we bypass this check by defining
# PACKAGE, and PACKAGE_VERSION
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DPACKAGE=${PROJECT_NAME} -DPACKAGE_VERSION=${${PROJECT_NAME}_VERSION}")

# Toggle below if wanting to build with debug
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-inline -g")
#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -fno-inline -g")
#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")

# Temporary to work around hyperlink problem
# Extend CMAKE_C_FLAGS itself: the previous spelling expanded the undefined
# variable CMAKE_CFLAGS and therefore threw away any C flags already set.
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__ARMv7 -DDEVICE_K2H")

if (SHANNON_BUILD)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDSPC868X")
endif()

configure_file(core/config.h.cmake
               ${CMAKE_CURRENT_BINARY_DIR}/core/config.h)

# Sources compiled in every build variant. The headers listed at the end are
# produced in the build tree and are marked GENERATED further down.
set(COAL_SRC_FILES
    api/api_command.cpp
    api/api_device.cpp
    api/api_event.cpp
    api/api_kernel.cpp
    api/api_platform.cpp
    api/api_program.cpp
    api/api_context.cpp
    api/api_enqueue.cpp
    api/api_flush.cpp
    api/api_memory.cpp
    api/api_profiling.cpp
    api/api_sampler.cpp
    api/api_gl.cpp

    core/context.cpp
    core/commandqueue.cpp
    core/memobject.cpp
    core/events.cpp
    core/program.cpp
    core/compiler.cpp
    core/kernel.cpp
    core/sampler.cpp
    core/object.cpp
    core/platform.cpp
    core/icd.cpp
    core/util.cpp

    core/cpu/buffer.cpp
    core/cpu/device.cpp
    core/cpu/kernel.cpp
    core/cpu/program.cpp
    core/cpu/worker.cpp
    core/cpu/builtins.cpp
    core/cpu/sampler.cpp

    ${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib.h.embed.h
    ${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib.c.bc.embed.h
    ${CMAKE_CURRENT_BINARY_DIR}/runtime/builtins_impl.h
    ${CMAKE_CURRENT_BINARY_DIR}/runtime/builtins_def.h
    ${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib_impl.h
    ${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib_def.h
)

# DSP device support and the llvmopencl passes are left out of SHAMROCK builds.
if (NOT SHAMROCK_BUILD)
    list(APPEND COAL_SRC_FILES
        core/dsp/genfile_cache.cpp
        core/dsp/program.cpp
        core/dsp/wga.cpp
        core/dsp/driver.cpp
        core/dsp/buffer.cpp
        core/dsp/device.cpp
        core/dsp/kernel.cpp
        core/dsp/worker.cpp

        llvmopencl/AllocasToEntry.cc
        llvmopencl/BarrierBlock.cc
        llvmopencl/BarrierTailReplication.cc
        llvmopencl/BreakConstantGEPs.cpp
        llvmopencl/CanonicalizeBarriers.cc
        llvmopencl/Flatten.cc
        llvmopencl/GenerateHeader.cc
        llvmopencl/ImplicitLoopBarriers.cc
        llvmopencl/IsolateRegions.cc
        llvmopencl/Kernel.cc
        llvmopencl/LLVMUtils.cc
        llvmopencl/LoopBarriers.cc
        llvmopencl/ParallelRegion.cc
        llvmopencl/PHIsToAllocas.cc
        llvmopencl/TargetAddressSpaces.cc
        llvmopencl/VariableUniformityAnalysis.cc
        llvmopencl/WIVectorize.cc
        llvmopencl/Workgroup.cc
        llvmopencl/WorkItemAliasAnalysis.cc
        llvmopencl/WorkitemHandler.cc
        llvmopencl/WorkitemHandlerChooser.cc
        llvmopencl/WorkitemLoops.cc
        llvmopencl/WorkitemReplication.cc
    )
endif()

if (SHAMROCK_BUILD)
    add_subdirectory(builtins)
endif()
add_subdirectory(runtime)

# Append rather than overwrite so linker flags supplied by the user or a
# toolchain file are preserved.
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-Bsymbolic")
add_library(OpenCL SHARED ${COAL_SRC_FILES})

# These headers do not exist at configure time; tell CMake they are generated.
set_source_files_properties(
    ${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib.h.embed.h
    ${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib.c.bc.embed.h
    ${CMAKE_CURRENT_BINARY_DIR}/runtime/builtins_impl.h
    ${CMAKE_CURRENT_BINARY_DIR}/runtime/builtins_def.h
    ${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib_impl.h
    ${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib_def.h
    PROPERTIES GENERATED 1)

add_dependencies(OpenCL generate_stdlib_c)

if (NOT SHAMROCK_BUILD)
    add_dependencies(OpenCL generate_builtins)
    add_dependencies(OpenCL oclload)
    add_dependencies(OpenCL generate_dsp_builtins)
    add_subdirectory(core/dsp/ocl_load)
else()
    add_dependencies(generate_builtin_lib generate_bc_files)
    add_dependencies(generate_stdlib_c generate_builtin_lib)
endif()

if (HAWKING_BUILD)
    add_dependencies(OpenCL arm_clocl)
endif()

if (HAWKING_CROSS_COMPILE OR SHANNON_BUILD)
    add_dependencies(OpenCL x86_clocl)
endif()

set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)

set_target_properties(OpenCL PROPERTIES
    VERSION ${${PROJECT_NAME}_VERSION}
    SOVERSION ${${PROJECT_NAME}_SOVERSION}
)

# Quoted so that an empty LLVM_COMPILE_FLAGS still yields a well-formed
# property/value pair instead of a configure-time error.
set_source_files_properties(${COAL_SRC_FILES}
    PROPERTIES COMPILE_FLAGS "${LLVM_COMPILE_FLAGS}")

set_target_properties(OpenCL PROPERTIES
    LINK_FLAGS "${LLVM_LDFLAGS}"
    LINK_INTERFACE_LIBRARIES "")

set(LIBS
    ${CLANG_LIBS}
    ${LLVM_LIBS_CORE}
    ${LLVM_LIBS_JIT}
    pthread
    rt
    dl
    z
    tinfo
    m
)

if (SHANNON_BUILD)
    list(APPEND LIBS
        ${PROJECT_BINARY_DIR}/lib/liboclload.a
        ${SDK}/sdk/pciedrv/lib/pciedrv.a
        ${SDK}/sdk/dnldmgr/lib/dnldmgr.a
        ${SDK}/sdk/cmem/lib/cmem_drv.a
        ${SDK}/sdk/bufmgr/lib/bufmgr.a
        ${SDK}/sdk/mailBox/host/lib/mailBox.a
        pciaccess
    )
elseif (HAWKING_BUILD)
    list(APPEND LIBS
        ${PROJECT_BINARY_DIR}/lib/liboclload.a
        ${LINUX_DEVKIT_ROOT}/usr/lib/libmpmmailbox.so
        ${LINUX_DEVKIT_ROOT}/usr/lib/libmpmtransport.so
        ${LINUX_DEVKIT_ROOT}/usr/lib/libmpmclient.so
        ${LINUX_DEVKIT_ROOT}/usr/lib/libticmem.so
        ${LINUX_DEVKIT_ROOT}/usr/lib/libkeystonemmap.so
        # We don't really depend on libhyplnk but link against it
        # to work around an mscsk issue.
        ${LINUX_DEVKIT_ROOT}/usr/lib/libhyplnk_k2h.so)
endif()

if (NOT SHAMROCK_BUILD)
    # When cross-compiling, take these libraries from the devkit root rather
    # than searching the host.
    if (HAWKING_CROSS_COMPILE)
        set(FFI_LIB ${LINUX_DEVKIT_ROOT}/usr/lib/libffi.so.6)
        set(BFD_LIB ${LINUX_DEVKIT_ROOT}/usr/lib/libbfd.so)
        set(SQLITE3_LIB ${LINUX_DEVKIT_ROOT}/usr/lib/libsqlite3.so.0)
    else()
        find_library(FFI_LIB ffi)
        find_library(BFD_LIB bfd)
        find_library(SQLITE3_LIB sqlite3)
    endif()

    list(APPEND LIBS ${FFI_LIB} ${BFD_LIB} ${SQLITE3_LIB})
endif()

target_link_libraries(OpenCL ${LIBS})
install(TARGETS OpenCL LIBRARY DESTINATION lib ${OCL_FPERMS})
diff --git a/src/api/api_command.cpp b/src/api/api_command.cpp new file mode 100644 index 0000000..e9972c6 --- /dev/null +++ b/src/api/api_command.cpp @@ -0,0 +1,130 @@
/*
 * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file api_command.cpp + * \brief Command queues + */ + +#include <core/commandqueue.h> +#include <core/deviceinterface.h> +#include <core/context.h> + +#include <CL/cl.h> + +// Command Queue APIs +cl_command_queue +clCreateCommandQueue(cl_context context, + cl_device_id device, + cl_command_queue_properties properties, + cl_int * errcode_ret) +{ + cl_int default_errcode_ret; + + // No errcode_ret ? 
+ if (!errcode_ret) + errcode_ret = &default_errcode_ret; + + if (!device->isA(Coal::Object::T_Device)) + { + *errcode_ret = CL_INVALID_DEVICE; + return 0; + } + + if (!context->isA(Coal::Object::T_Context)) + { + *errcode_ret = CL_INVALID_CONTEXT; + return 0; + } + + *errcode_ret = CL_SUCCESS; + Coal::CommandQueue *queue = new Coal::CommandQueue( + (Coal::Context *)context, + (Coal::DeviceInterface *)device, + properties, + errcode_ret); + + if (*errcode_ret != CL_SUCCESS) + { + // Initialization failed, destroy context + delete queue; + return 0; + } + + return (_cl_command_queue *)queue; +} + +cl_int +clRetainCommandQueue(cl_command_queue command_queue) +{ + if (!command_queue->isA(Coal::Object::T_CommandQueue)) + return CL_INVALID_COMMAND_QUEUE; + + command_queue->reference(); + + return CL_SUCCESS; +} + +cl_int +clReleaseCommandQueue(cl_command_queue command_queue) +{ + if (!command_queue->isA(Coal::Object::T_CommandQueue)) + return CL_INVALID_COMMAND_QUEUE; + + command_queue->flush(); + + if (command_queue->dereference()) + delete command_queue; + + return CL_SUCCESS; +} + +cl_int +clGetCommandQueueInfo(cl_command_queue command_queue, + cl_command_queue_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) +{ + if (!command_queue->isA(Coal::Object::T_CommandQueue)) + return CL_INVALID_COMMAND_QUEUE; + + return command_queue->info(param_name, param_value_size, param_value, + param_value_size_ret); +} + +cl_int +clSetCommandQueueProperty(cl_command_queue command_queue, + cl_command_queue_properties properties, + cl_bool enable, + cl_command_queue_properties * old_properties) +{ + if (!command_queue->isA(Coal::Object::T_CommandQueue)) + return CL_INVALID_COMMAND_QUEUE; + + return command_queue->setProperty(properties, enable, old_properties); +} diff --git a/src/api/api_context.cpp b/src/api/api_context.cpp new file mode 100644 index 0000000..abe7be6 --- /dev/null +++ b/src/api/api_context.cpp @@ -0,0 +1,149 @@ +/* + 
* Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/ + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** + * \file api_context.cpp + * \brief Contexts + */ + +#include <CL/cl.h> +#include <core/context.h> +#include <core/platform.h> +#include <stdlib.h> + +// Context APIs + +cl_context +clCreateContext(const cl_context_properties *properties, + cl_uint num_devices, + const cl_device_id * devices, + void (CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *), + void * user_data, + cl_int * errcode_ret) +{ + cl_int default_errcode_ret; + + // No errcode_ret ? + if (!errcode_ret) + errcode_ret = &default_errcode_ret; + + if (!devices || + !num_devices || + (!pfn_notify && user_data)) + { + *errcode_ret = CL_INVALID_VALUE; + return 0; + } + + *errcode_ret = CL_SUCCESS; + Coal::Context *ctx = new Coal::Context(properties, num_devices, devices, + pfn_notify, user_data, errcode_ret); + + if (*errcode_ret != CL_SUCCESS) + { + // Initialization failed, destroy context + delete ctx; + return 0; + } + + return (_cl_context *)ctx; +} + +cl_context +clCreateContextFromType(const cl_context_properties *properties, + cl_device_type device_type, + void (CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *), + void * user_data, + cl_int * errcode_ret) +{ + cl_device_id* devices; + cl_uint num_devices; + cl_int local_error; + cl_context result = NULL; + + local_error = clGetDeviceIDs(&the_platform, device_type, 0, NULL, + &num_devices); + if (!num_devices) { local_error = CL_INVALID_DEVICE; goto bail; } + + devices = (cl_device_id*) malloc(num_devices * sizeof(cl_device_id)); + if (!devices) { local_error = CL_OUT_OF_HOST_MEMORY; goto bail; } + + local_error = clGetDeviceIDs(&the_platform, device_type, num_devices, + devices, 0); + + if (local_error != CL_SUCCESS) { free (devices); goto bail; } + + result = clCreateContext(properties, num_devices, devices, pfn_notify, user_data, + &local_error); + + free (devices); + +bail: + if (errcode_ret) + *errcode_ret = local_error; + + return result; +} + +cl_int +clRetainContext(cl_context context) +{ + 
if (!context->isA(Coal::Object::T_Context)) + return CL_INVALID_CONTEXT; + + context->reference(); + + return CL_SUCCESS; +} + +cl_int +clReleaseContext(cl_context context) +{ + if (!context->isA(Coal::Object::T_Context)) + return CL_INVALID_CONTEXT; + + if (context->dereference()) + delete context; + + return CL_SUCCESS; +} + +cl_int +clGetContextInfo(cl_context context, + cl_context_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) +{ + if (!context->isA(Coal::Object::T_Context)) + return CL_INVALID_CONTEXT; + + return context->info(param_name, param_value_size, param_value, + param_value_size_ret); +} diff --git a/src/api/api_device.cpp b/src/api/api_device.cpp new file mode 100644 index 0000000..052f0b4 --- /dev/null +++ b/src/api/api_device.cpp @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file api_device.cpp + * \brief Devices + */ + +#include "CL/cl.h" +#include <core/platform.h> +#include <core/deviceinterface.h> + +cl_int +clGetDeviceIDs(cl_platform_id platform, + cl_device_type device_type, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) +{ + /*------------------------------------------------------------------------- + * We currently implement only one platform + *------------------------------------------------------------------------*/ + if (!platform) platform = &the_platform; + + if (platform != &the_platform) return CL_INVALID_PLATFORM; + if (num_entries == 0 && devices != 0) return CL_INVALID_VALUE; + if (num_devices == 0 && devices == 0) return CL_INVALID_VALUE; + + int device_number = platform->getDevices(device_type, + num_entries, devices); + + if (num_devices) *num_devices = device_number; + + if (device_number == 0) + return CL_DEVICE_NOT_FOUND; + + return CL_SUCCESS; +} + +cl_int +clGetDeviceInfo(cl_device_id device, + cl_device_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) +{ + if (!device->isA(Coal::Object::T_Device)) + return CL_INVALID_DEVICE; + + Coal::DeviceInterface *iface = (Coal::DeviceInterface *)device; + return iface->info(param_name, param_value_size, param_value, + param_value_size_ret); +} diff --git a/src/api/api_enqueue.cpp b/src/api/api_enqueue.cpp new file mode 100644 index 0000000..5ed3b1a --- /dev/null +++ 
b/src/api/api_enqueue.cpp @@ -0,0 +1,823 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** + * \file api_enqueue.cpp + * \brief Events + */ + +#include <CL/cl.h> + +#include <core/events.h> +#include <core/memobject.h> + +#include <cstdlib> +#include <stdio.h> + +static inline cl_int queueEvent(Coal::CommandQueue *queue, + Coal::Event *command, + cl_event *event, + cl_bool blocking) +{ + cl_int rs; + Coal::Event *old_event = NULL; + + if (event) + { +#if 0 + /*--------------------------------------------------------------------- + * It is up to the user to release events for reuse. If they do not + * they will have a memory leak for old events. This can impact + * memory performance since the old event memory is likely already warm + * in cache. + *--------------------------------------------------------------------*/ + /*--------------------------------------------------------------------- + * We should also reduce the reference count of the old event, because + * user_app_event is now interested in a different event. + *--------------------------------------------------------------------*/ + old_event = *event; + if (old_event != NULL && old_event->isA(Coal::Object::T_Event)) + clReleaseEvent((cl_event)old_event); + +#endif + /*--------------------------------------------------------------------- + * We need to increase reference count before queue->queueEvent(command) + * because a user_app_event is interested in the status of command. + * Otherwise, if worker thread runs too fast, command becomes COMPLETE + * before we get here, command would have been cleaned from queue and + * deleted!!! Thus we will be left with a dangling pointer. + *--------------------------------------------------------------------*/ + *event = (cl_event)command; + command->reference(); + } + + /*------------------------------------------------------------------------ + * Same reason as above. We need to retain command for clWaitForEvents(). 
+ *-----------------------------------------------------------------------*/ + if (blocking) command->reference(); + + rs = queue->queueEvent(command); + + if (rs != CL_SUCCESS) + { + delete command; + return rs; + } + + if (blocking) + { + rs = clWaitForEvents(1, (cl_event *)&command); + + if (rs != CL_SUCCESS) + { + delete command; + return rs; + } + command->dereference(); + } + + return CL_SUCCESS; +} + +// Enqueued Commands APIs +cl_int +clEnqueueReadBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_read, + size_t offset, + size_t cb, + void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) +{ + cl_int rs = CL_SUCCESS; + + if (!command_queue->isA(Coal::Object::T_CommandQueue)) + return CL_INVALID_COMMAND_QUEUE; + + Coal::ReadBufferEvent *command = new Coal::ReadBufferEvent( + (Coal::CommandQueue *)command_queue, + (Coal::MemObject *)buffer, + offset, cb, ptr, + num_events_in_wait_list, (const Coal::Event **)event_wait_list, &rs + ); + + if (rs != CL_SUCCESS) + { + delete command; + return rs; + } + + return queueEvent(command_queue, command, event, blocking_read); +} + +cl_int +clEnqueueWriteBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_write, + size_t offset, + size_t cb, + const void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) +{ + cl_int rs = CL_SUCCESS; + + if (!command_queue->isA(Coal::Object::T_CommandQueue)) + return CL_INVALID_COMMAND_QUEUE; + + Coal::WriteBufferEvent *command = new Coal::WriteBufferEvent( + (Coal::CommandQueue *)command_queue, + (Coal::MemObject *)buffer, + offset, cb, (void *)ptr, + num_events_in_wait_list, (const Coal::Event **)event_wait_list, &rs + ); + + if (rs != CL_SUCCESS) + { + delete command; + return rs; + } + + return queueEvent(command_queue, command, event, blocking_write); +} + +cl_int +clEnqueueReadBufferRect(cl_command_queue command_queue, + cl_mem buffer, + 
cl_bool blocking_read, + const size_t * buffer_origin, + const size_t * host_origin, + const size_t * region, + size_t buffer_row_pitch, + size_t buffer_slice_pitch, + size_t host_row_pitch, + size_t host_slice_pitch, + void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) +{ + cl_int rs = CL_SUCCESS; + + if (!command_queue->isA(Coal::Object::T_CommandQueue)) + return CL_INVALID_COMMAND_QUEUE; + + Coal::ReadBufferRectEvent *command = new Coal::ReadBufferRectEvent( + (Coal::CommandQueue *)command_queue, + (Coal::MemObject *)buffer, + buffer_origin, host_origin, region, buffer_row_pitch, buffer_slice_pitch, + host_row_pitch, host_slice_pitch, ptr, + num_events_in_wait_list, (const Coal::Event **)event_wait_list, &rs + ); + + if (rs != CL_SUCCESS) + { + delete command; + return rs; + } + + return queueEvent(command_queue, command, event, blocking_read); +} + +cl_int +clEnqueueWriteBufferRect(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_write, + const size_t * buffer_origin, + const size_t * host_origin, + const size_t * region, + size_t buffer_row_pitch, + size_t buffer_slice_pitch, + size_t host_row_pitch, + size_t host_slice_pitch, + const void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) +{ + cl_int rs = CL_SUCCESS; + + if (!command_queue->isA(Coal::Object::T_CommandQueue)) + return CL_INVALID_COMMAND_QUEUE; + + Coal::WriteBufferRectEvent *command = new Coal::WriteBufferRectEvent( + (Coal::CommandQueue *)command_queue, + (Coal::MemObject *)buffer, + buffer_origin, host_origin, region, buffer_row_pitch, buffer_slice_pitch, + host_row_pitch, host_slice_pitch, (void *)ptr, + num_events_in_wait_list, (const Coal::Event **)event_wait_list, &rs + ); + + if (rs != CL_SUCCESS) + { + delete command; + return rs; + } + + return queueEvent(command_queue, command, event, blocking_write); +} + +cl_int +clEnqueueCopyBufferRect(cl_command_queue 
command_queue, + cl_mem src_buffer, + cl_mem dst_buffer, + const size_t * src_origin, + const size_t * dst_origin, + const size_t * region, + size_t src_row_pitch, + size_t src_slice_pitch, + size_t dst_row_pitch, + size_t dst_slice_pitch, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) +{ + cl_int rs = CL_SUCCESS; + + if (!command_queue->isA(Coal::Object::T_CommandQueue)) + return CL_INVALID_COMMAND_QUEUE; + + Coal::CopyBufferRectEvent *command = new Coal::CopyBufferRectEvent( + (Coal::CommandQueue *)command_queue, + (Coal::MemObject *)src_buffer, + (Coal::MemObject *)dst_buffer, + src_origin, dst_origin, region, src_row_pitch, src_slice_pitch, + dst_row_pitch, dst_slice_pitch, 1, + num_events_in_wait_list, (const Coal::Event **)event_wait_list, &rs + ); + + if (rs != CL_SUCCESS) + { + delete command; + return rs; + } + + return queueEvent(command_queue, command, event, false); +} + +cl_int +clEnqueueCopyBuffer(cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_buffer, + size_t src_offset, + size_t dst_offset, + size_t cb, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) +{ + cl_int rs = CL_SUCCESS; + + if (!command_queue->isA(Coal::Object::T_CommandQueue)) + return CL_INVALID_COMMAND_QUEUE; + + Coal::CopyBufferEvent *command = new Coal::CopyBufferEvent( + (Coal::CommandQueue *)command_queue, + (Coal::MemObject *)src_buffer, + (Coal::MemObject *)dst_buffer, + src_offset, dst_offset, cb, + num_events_in_wait_list, (const Coal::Event **)event_wait_list, &rs + ); + + if (rs != CL_SUCCESS) + { + delete command; + return rs; + } + + return queueEvent(command_queue, command, event, false); +} + +cl_int +clEnqueueReadImage(cl_command_queue command_queue, + cl_mem image, + cl_bool blocking_read, + const size_t * origin, + const size_t * region, + size_t row_pitch, + size_t slice_pitch, + void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + 
cl_event * event)
+{
+    cl_int rs = CL_SUCCESS;
+
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    if (!image || (image->type() != Coal::MemObject::Image2D &&
+                   image->type() != Coal::MemObject::Image3D))
+        return CL_INVALID_MEM_OBJECT;
+
+    Coal::ReadImageEvent *command = new Coal::ReadImageEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::Image2D *)image,
+        origin, region, row_pitch, slice_pitch, (void *)ptr,
+        num_events_in_wait_list, (const Coal::Event **)event_wait_list, &rs
+    );
+
+    if (rs != CL_SUCCESS)
+    {
+        delete command;
+        return rs;
+    }
+
+    return queueEvent(command_queue, command, event, blocking_read);
+}
+
+/**
+ * \brief Enqueue a write from host memory into an image
+ *
+ * The image handle is checked (non-null, Image2D or Image3D) before it is
+ * cast to Coal::Image2D, mirroring the read function directly above.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE, CL_INVALID_MEM_OBJECT, the error code
+ *         written by the Coal::WriteImageEvent constructor, or the result
+ *         of queueEvent()
+ */
+cl_int
+clEnqueueWriteImage(cl_command_queue command_queue,
+                    cl_mem           image,
+                    cl_bool          blocking_write,
+                    const size_t *   origin,
+                    const size_t *   region,
+                    size_t           row_pitch,
+                    size_t           slice_pitch,
+                    const void *     ptr,
+                    cl_uint          num_events_in_wait_list,
+                    const cl_event * event_wait_list,
+                    cl_event *       event)
+{
+    cl_int rs = CL_SUCCESS;
+
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    // Same validation as the read path: never cast an arbitrary cl_mem to
+    // an image type without checking what it really is.
+    if (!image || (image->type() != Coal::MemObject::Image2D &&
+                   image->type() != Coal::MemObject::Image3D))
+        return CL_INVALID_MEM_OBJECT;
+
+    Coal::WriteImageEvent *command = new Coal::WriteImageEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::Image2D *)image,
+        origin, region, row_pitch, slice_pitch, (void *)ptr,
+        num_events_in_wait_list, (const Coal::Event **)event_wait_list, &rs
+    );
+
+    if (rs != CL_SUCCESS)
+    {
+        delete command;
+        return rs;
+    }
+
+    return queueEvent(command_queue, command, event, blocking_write);
+}
+
+/**
+ * \brief Enqueue an image-to-image copy (never blocking)
+ * \return CL_INVALID_COMMAND_QUEUE, the Coal::CopyImageEvent constructor's
+ *         error code, or the result of queueEvent()
+ */
+cl_int
+clEnqueueCopyImage(cl_command_queue command_queue,
+                   cl_mem           src_image,
+                   cl_mem           dst_image,
+                   const size_t *   src_origin,
+                   const size_t *   dst_origin,
+                   const size_t *   region,
+                   cl_uint          num_events_in_wait_list,
+                   const cl_event * event_wait_list,
+                   cl_event *       event)
+{
+    cl_int rs = CL_SUCCESS;
+
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    Coal::CopyImageEvent *command = new Coal::CopyImageEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::Image2D *)src_image, (Coal::Image2D *)dst_image,
+        src_origin, dst_origin, region,
+        num_events_in_wait_list, (const Coal::Event **)event_wait_list, &rs
+    );
+
+    if (rs != CL_SUCCESS)
+    {
+        delete command;
+        return rs;
+    }
+
+    return queueEvent(command_queue, command, event, false);
+}
+
+/**
+ * \brief Enqueue a copy from an image into a buffer (never blocking)
+ * \return CL_INVALID_COMMAND_QUEUE, the Coal::CopyImageToBufferEvent
+ *         constructor's error code, or the result of queueEvent()
+ */
+cl_int
+clEnqueueCopyImageToBuffer(cl_command_queue command_queue,
+                           cl_mem           src_image,
+                           cl_mem           dst_buffer,
+                           const size_t *   src_origin,
+                           const size_t *   region,
+                           size_t           dst_offset,
+                           cl_uint          num_events_in_wait_list,
+                           const cl_event * event_wait_list,
+                           cl_event *       event)
+{
+    cl_int rs = CL_SUCCESS;
+
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    Coal::CopyImageToBufferEvent *command = new Coal::CopyImageToBufferEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::Image2D *)src_image, (Coal::MemObject *)dst_buffer,
+        src_origin, region, dst_offset,
+        num_events_in_wait_list, (const Coal::Event **)event_wait_list, &rs
+    );
+
+    if (rs != CL_SUCCESS)
+    {
+        delete command;
+        return rs;
+    }
+
+    return queueEvent(command_queue, command, event, false);
+}
+
+/**
+ * \brief Enqueue a copy from a buffer into an image (never blocking)
+ * \return CL_INVALID_COMMAND_QUEUE, the Coal::CopyBufferToImageEvent
+ *         constructor's error code, or the result of queueEvent()
+ */
+cl_int
+clEnqueueCopyBufferToImage(cl_command_queue command_queue,
+                           cl_mem           src_buffer,
+                           cl_mem           dst_image,
+                           size_t           src_offset,
+                           const size_t *   dst_origin,
+                           const size_t *   region,
+                           cl_uint          num_events_in_wait_list,
+                           const cl_event * event_wait_list,
+                           cl_event *       event)
+{
+    cl_int rs = CL_SUCCESS;
+
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    Coal::CopyBufferToImageEvent *command = new Coal::CopyBufferToImageEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::MemObject *)src_buffer, (Coal::Image2D *)dst_image,
+        src_offset, dst_origin, region,
+        num_events_in_wait_list, (const Coal::Event **)event_wait_list, &rs
+    );
+
+    if (rs != CL_SUCCESS)
+    {
+        delete command;
+        return rs;
+    }
+
+    return queueEvent(command_queue, command, event, false);
+}
+
+/**
+ * \brief Enqueue a buffer mapping and return the mapped host pointer
+ *
+ * \param errcode_ret optional; a local dummy is used when it is NULL
+ * \return the pointer reported by the event's ptr(), or 0 on any error
+ *         (with the error code stored through \p errcode_ret)
+ */
+void *
+clEnqueueMapBuffer(cl_command_queue command_queue,
+                   cl_mem           buffer,
+                   cl_bool          blocking_map,
+                   cl_map_flags     map_flags,
+                   size_t           offset,
+                   size_t           cb,
+                   cl_uint          num_events_in_wait_list,
+                   const cl_event * event_wait_list,
+                   cl_event *       event,
+                   cl_int *         errcode_ret)
+{
+    cl_int dummy_errcode;
+
+    if (!errcode_ret)
+        errcode_ret = &dummy_errcode;
+
+    *errcode_ret = CL_SUCCESS;
+
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+    {
+        *errcode_ret = CL_INVALID_COMMAND_QUEUE;
+        return 0;
+    }
+
+    Coal::MapBufferEvent *command = new Coal::MapBufferEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::MemObject *)buffer,
+        offset, cb, map_flags,
+        num_events_in_wait_list, (const Coal::Event **)event_wait_list, errcode_ret
+    );
+
+    if (*errcode_ret != CL_SUCCESS)
+    {
+        delete command;
+        return 0;
+    }
+
+    // We need command to be valid after queueEvent, so don't let the command
+    // queue handle it like a fire-and-forget event. Fixes a crash when event
+    // is NULL : the event gets deleted by clReleaseEvent called from
+    // CPUDevice's worker() and we then try to read it in command->ptr();
+    command->reference();
+
+    *errcode_ret = queueEvent(command_queue, command, event, blocking_map);
+
+    if (*errcode_ret != CL_SUCCESS)
+    {
+        delete command;
+        return 0;
+    }
+    else
+    {
+        void *rs = command->ptr();
+
+        // Drop the extra reference taken above
+        clReleaseEvent((cl_event)command);
+
+        return rs;
+    }
+}
+
+/**
+ * \brief Enqueue an image mapping and return the mapped host pointer
+ *
+ * \param image_row_pitch   required; receives the event's row_pitch()
+ * \param image_slice_pitch required for Image3D objects, optional otherwise
+ * \param errcode_ret       optional; a local dummy is used when it is NULL
+ * \return the pointer reported by the event's ptr(), or 0 on any error
+ *         (with the error code stored through \p errcode_ret)
+ */
+void *
+clEnqueueMapImage(cl_command_queue command_queue,
+                  cl_mem           image,
+                  cl_bool          blocking_map,
+                  cl_map_flags     map_flags,
+                  const size_t *   origin,
+                  const size_t *   region,
+                  size_t *         image_row_pitch,
+                  size_t *         image_slice_pitch,
+                  cl_uint          num_events_in_wait_list,
+                  const cl_event * event_wait_list,
+                  cl_event *       event,
+                  cl_int *         errcode_ret)
+{
+    cl_int dummy_errcode;
+
+    if (!errcode_ret)
+        errcode_ret = &dummy_errcode;
+
+    *errcode_ret = CL_SUCCESS;
+
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+    {
+        *errcode_ret = CL_INVALID_COMMAND_QUEUE;
+        return 0;
+    }
+
+    Coal::MapImageEvent *command = new Coal::MapImageEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::Image2D *)image,
+        map_flags, origin, region,
+        num_events_in_wait_list, (const Coal::Event **)event_wait_list, errcode_ret
+    );
+
+    if (*errcode_ret != CL_SUCCESS)
+    {
+        delete command;
+        return 0;
+    }
+
+    if (!image_row_pitch ||
+        (image->type() == Coal::MemObject::Image3D && !image_slice_pitch))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        delete command;
+        return 0;
+    }
+
+    command->reference(); // See clEnqueueMapBuffer for explanation.
+    *errcode_ret = queueEvent(command_queue, command, event, blocking_map);
+
+    if (*errcode_ret != CL_SUCCESS)
+    {
+        delete command;
+        return 0;
+    }
+    else
+    {
+        *image_row_pitch = command->row_pitch();
+
+        if (image_slice_pitch)
+            *image_slice_pitch = command->slice_pitch();
+
+        void *mapped = command->ptr();
+
+        // Drop the extra reference taken above
+        clReleaseEvent((cl_event)command);
+
+        return mapped;
+    }
+}
+
+/**
+ * \brief Enqueue the unmapping of a previously mapped memory object
+ * \return CL_INVALID_COMMAND_QUEUE, the Coal::UnmapBufferEvent constructor's
+ *         error code, or the result of queueEvent()
+ */
+cl_int
+clEnqueueUnmapMemObject(cl_command_queue command_queue,
+                        cl_mem           memobj,
+                        void *           mapped_ptr,
+                        cl_uint          num_events_in_wait_list,
+                        const cl_event * event_wait_list,
+                        cl_event *       event)
+{
+    cl_int rs = CL_SUCCESS;
+
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+    {
+        return CL_INVALID_COMMAND_QUEUE;
+    }
+
+    Coal::UnmapBufferEvent *command = new Coal::UnmapBufferEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::MemObject *)memobj,
+        mapped_ptr,
+        num_events_in_wait_list, (const Coal::Event **)event_wait_list, &rs
+    );
+
+    if (rs != CL_SUCCESS)
+    {
+        delete command;
+        return rs;
+    }
+
+    return queueEvent(command_queue, command, event, false);
+}
+
+/**
+ * \brief Enqueue an N-dimensional kernel execution
+ * \return CL_INVALID_COMMAND_QUEUE, the Coal::KernelEvent constructor's
+ *         error code, or the result of queueEvent()
+ */
+cl_int
+clEnqueueNDRangeKernel(cl_command_queue command_queue,
+                       cl_kernel        kernel,
+                       cl_uint          work_dim,
+                       const size_t *   global_work_offset,
+                       const size_t *   global_work_size,
+                       const size_t *   local_work_size,
+                       cl_uint          num_events_in_wait_list,
+                       const cl_event * event_wait_list,
+                       cl_event *       event)
+{
+    cl_int rs = CL_SUCCESS;
+
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+    {
+        return CL_INVALID_COMMAND_QUEUE;
+    }
+
+    Coal::KernelEvent *command = new Coal::KernelEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::Kernel *)kernel,
+        work_dim, global_work_offset, global_work_size, local_work_size,
+        num_events_in_wait_list, (const Coal::Event **)event_wait_list, &rs
+    );
+
+    if (rs != CL_SUCCESS)
+    {
+        delete command;
+        return rs;
+    }
+
+    return queueEvent(command_queue, command, event, false);
+}
+
+/**
+ * \brief Enqueue a kernel as a task through Coal::TaskEvent
+ * \return CL_INVALID_COMMAND_QUEUE, the Coal::TaskEvent constructor's
+ *         error code, or the result of queueEvent()
+ */
+cl_int
+clEnqueueTask(cl_command_queue command_queue,
+              cl_kernel        kernel,
+              cl_uint          num_events_in_wait_list,
+              const cl_event * event_wait_list,
+              cl_event *       event)
+{
+    cl_int rs = CL_SUCCESS;
+
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+    {
+        return CL_INVALID_COMMAND_QUEUE;
+    }
+
+    Coal::TaskEvent *command = new Coal::TaskEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::Kernel *)kernel,
+        num_events_in_wait_list, (const Coal::Event **)event_wait_list, &rs
+    );
+
+    if (rs != CL_SUCCESS)
+    {
+        delete command;
+        return rs;
+    }
+
+    return queueEvent(command_queue, command, event, false);
+}
+
+/**
+ * \brief Enqueue a host function as a native kernel
+ * \return CL_INVALID_COMMAND_QUEUE, the Coal::NativeKernelEvent
+ *         constructor's error code, or the result of queueEvent()
+ */
+cl_int
+clEnqueueNativeKernel(cl_command_queue command_queue,
+                      void (*user_func)(void *),
+                      void *           args,
+                      size_t           cb_args,
+                      cl_uint          num_mem_objects,
+                      const cl_mem *   mem_list,
+                      const void **    args_mem_loc,
+                      cl_uint          num_events_in_wait_list,
+                      const cl_event * event_wait_list,
+                      cl_event *       event)
+{
+    cl_int rs = CL_SUCCESS;
+
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    Coal::NativeKernelEvent *command = new Coal::NativeKernelEvent(
+        (Coal::CommandQueue *)command_queue,
+        user_func, args, cb_args, num_mem_objects,
+        (const Coal::MemObject **)mem_list, args_mem_loc,
+        num_events_in_wait_list, (const Coal::Event **)event_wait_list, &rs
+    );
+
+    if (rs != CL_SUCCESS)
+    {
+        delete command;
+        return rs;
+    }
+
+    return queueEvent(command_queue, command, event, false);
+}
+
+/**
+ * \brief Enqueue a marker that waits on the events currently in the queue
+ *
+ * The event array obtained from the queue is released (one dereference()
+ * per entry, then std::free) on both the success and the error path; the
+ * original code leaked it when the MarkerEvent constructor failed.
+ *
+ * \param event required; CL_INVALID_VALUE is returned when it is NULL
+ * \return CL_INVALID_COMMAND_QUEUE, CL_INVALID_VALUE, the constructor's
+ *         error code, or the result of queueEvent()
+ */
+cl_int
+clEnqueueMarker(cl_command_queue command_queue,
+                cl_event *       event)
+{
+    cl_int rs = CL_SUCCESS;
+
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    if (!event)
+        return CL_INVALID_VALUE;
+
+    // Get the events in command_queue
+    unsigned int count = 0;
+    Coal::Event **events = command_queue->events(count, false);
+
+    Coal::MarkerEvent *command = new Coal::MarkerEvent(
+        (Coal::CommandQueue *)command_queue,
+        count, count == 0 ? NULL : (const Coal::Event **)events, &rs);
+
+    // Free events, they were memcpyed by Coal::Event. This is done before
+    // looking at rs so that a failed constructor does not leak the array
+    // or the references it holds.
+    for (unsigned int i=0; i<count; ++i)
+    {
+        events[i]->dereference();
+    }
+
+    if (events != NULL) std::free(events);
+
+    if (rs != CL_SUCCESS)
+    {
+        delete command;
+        return rs;
+    }
+
+    return queueEvent(command_queue, command, event, false);
+}
+
+/**
+ * \brief Enqueue a wait on an explicit list of events
+ * \return CL_INVALID_COMMAND_QUEUE, the Coal::WaitForEventsEvent
+ *         constructor's error code, or the result of queueEvent()
+ */
+cl_int
+clEnqueueWaitForEvents(cl_command_queue command_queue,
+                       cl_uint          num_events,
+                       const cl_event * event_list)
+{
+    cl_int rs = CL_SUCCESS;
+
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    Coal::WaitForEventsEvent *command = new Coal::WaitForEventsEvent(
+        (Coal::CommandQueue *)command_queue,
+        num_events, (const Coal::Event **)event_list, &rs);
+
+    if (rs != CL_SUCCESS)
+    {
+        delete command;
+        return rs;
+    }
+
+    // No event handle is handed back to the caller for this command
+    return queueEvent(command_queue, command, 0, false);
+}
+
+/**
+ * \brief Enqueue a barrier
+ * \return CL_INVALID_COMMAND_QUEUE, the Coal::BarrierEvent constructor's
+ *         error code, or the result of queueEvent()
+ */
+cl_int
+clEnqueueBarrier(cl_command_queue command_queue)
+{
+    cl_int rs = CL_SUCCESS;
+
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    Coal::BarrierEvent *command = new Coal::BarrierEvent(
+        (Coal::CommandQueue *)command_queue, &rs);
+
+    if (rs != CL_SUCCESS)
+    {
+        delete command;
+        return rs;
+    }
+
+    // No event handle is handed back to the caller for this command
+    return queueEvent(command_queue, command, 0, false);
+}
diff --git a/src/api/api_event.cpp b/src/api/api_event.cpp
new file mode 100644
index 0000000..1e882bf
--- /dev/null
+++ b/src/api/api_event.cpp
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+/**
+ * \file api_event.cpp
+ * \brief Special events and event management
+ */
+
+#include <CL/cl.h>
+
+#include <core/commandqueue.h>
+#include <core/events.h>
+#include <core/context.h>
+#include <stdio.h>
+
+// Event Object APIs
+
+/**
+ * \brief Block until every event in \p event_list reaches Complete
+ *
+ * All events are validated first: each must be an event object, must not
+ * carry a negative (error) status, and all must share the same context
+ * (taken as parent()->parent() of the event).
+ *
+ * \return CL_SUCCESS, CL_INVALID_VALUE (empty/NULL list), CL_INVALID_EVENT,
+ *         CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST or CL_INVALID_CONTEXT
+ */
+cl_int
+clWaitForEvents(cl_uint          num_events,
+                const cl_event * event_list)
+{
+    if (!num_events || !event_list)
+        return CL_INVALID_VALUE;
+
+    // Check the events in the list
+    cl_context global_ctx = 0;
+
+    for (cl_uint i=0; i<num_events; ++i)
+    {
+        if (!event_list[i]->isA(Coal::Object::T_Event))
+            return CL_INVALID_EVENT;
+
+        if (event_list[i]->status() < 0)
+            return CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+
+        cl_context evt_ctx = (cl_context)event_list[i]->parent()->parent();
+
+#if 0 // YUAN: no need to wait for queue to be flushed
+        cl_command_queue evt_queue = (cl_command_queue)event_list[i]->parent();
+        // Flush the queue
+        evt_queue->flush();
+#endif
+
+        if (global_ctx == 0)
+            global_ctx = evt_ctx;
+        else if (global_ctx != evt_ctx)
+            return CL_INVALID_CONTEXT;
+    }
+
+    // Wait for the events
+    for (cl_uint i=0; i<num_events; ++i)
+    {
+        event_list[i]->waitForStatus(Coal::Event::Complete);
+    }
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Query an event; forwards to the event's info()
+ * \return CL_INVALID_EVENT or whatever info() returns
+ */
+cl_int
+clGetEventInfo(cl_event      event,
+               cl_event_info param_name,
+               size_t        param_value_size,
+               void *        param_value,
+               size_t *      param_value_size_ret)
+{
+    if (!event->isA(Coal::Object::T_Event))
+        return CL_INVALID_EVENT;
+
+    return event->info(param_name, param_value_size, param_value,
+                       param_value_size_ret);
+}
+
+/**
+ * \brief Register a completion callback on an event
+ *
+ * Only CL_COMPLETE is accepted as \p command_exec_callback_type.
+ *
+ * \return CL_SUCCESS, CL_INVALID_EVENT or CL_INVALID_VALUE
+ */
+cl_int
+clSetEventCallback(cl_event event,
+                   cl_int   command_exec_callback_type,
+                   void (CL_CALLBACK *pfn_event_notify)(cl_event event,
+                                                        cl_int   exec_status,
+                                                        void    *user_data),
+                   void    *user_data)
+{
+    if (!event->isA(Coal::Object::T_Event))
+        return CL_INVALID_EVENT;
+
+    if (!pfn_event_notify || command_exec_callback_type != CL_COMPLETE)
+        return CL_INVALID_VALUE;
+
+    event->setCallback(command_exec_callback_type, pfn_event_notify, user_data);
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Take one more reference on an event
+ * \return CL_SUCCESS or CL_INVALID_EVENT
+ */
+cl_int
+clRetainEvent(cl_event event)
+{
+    if (!event->isA(Coal::Object::T_Event))
+        return CL_INVALID_EVENT;
+
+    event->reference();
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Drop one reference; when dereference() reports true the device
+ *        data is freed and the event deleted
+ * \return CL_SUCCESS or CL_INVALID_EVENT
+ */
+cl_int
+clReleaseEvent(cl_event event)
+{
+    if (!event->isA(Coal::Object::T_Event))
+        return CL_INVALID_EVENT;
+
+    if (event->dereference())
+    {
+        event->freeDeviceData();
+        delete event;
+    }
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Create a user event in \p context
+ *
+ * \param errcode_ret optional; a local dummy is used when it is NULL
+ * \return the new event, or 0 with the error stored through \p errcode_ret
+ */
+cl_event
+clCreateUserEvent(cl_context context,
+                  cl_int *   errcode_ret)
+{
+    cl_int dummy_errcode;
+
+    if (!errcode_ret)
+        errcode_ret = &dummy_errcode;
+
+    if (!context->isA(Coal::Object::T_Context))
+    {
+        *errcode_ret = CL_INVALID_CONTEXT;
+        return 0;
+    }
+
+    *errcode_ret = CL_SUCCESS;
+
+    Coal::UserEvent *command = new Coal::UserEvent(
+        (Coal::Context *)context, errcode_ret
+    );
+
+    if (*errcode_ret != CL_SUCCESS)
+    {
+        delete command;
+        return 0;
+    }
+
+    return (cl_event)command;
+}
+
+/**
+ * \brief Set the execution status of a user event
+ *
+ * Per the OpenCL specification \p execution_status may be CL_COMPLETE or a
+ * negative integer signalling an error; the original code rejected negative
+ * values. The status can only be changed while the event is still
+ * CL_SUBMITTED.
+ *
+ * \return CL_SUCCESS, CL_INVALID_EVENT, CL_INVALID_VALUE or
+ *         CL_INVALID_OPERATION
+ */
+cl_int
+clSetUserEventStatus(cl_event event,
+                     cl_int   execution_status)
+{
+    Coal::Event *command = (Coal::Event *)event;
+
+    if (!command->isA(Coal::Object::T_Event) ||
+        command->type() != Coal::Event::User)
+        return CL_INVALID_EVENT;
+
+    // Valid values: CL_COMPLETE, or any negative value (error status)
+    if (execution_status != CL_COMPLETE && execution_status >= 0)
+        return CL_INVALID_VALUE;
+
+    if (command->status() != CL_SUBMITTED)
+        return CL_INVALID_OPERATION;
+
+    // NOTE(review): negative values are cast straight to Event::Status;
+    // clWaitForEvents already tests status() < 0, but confirm setStatus()
+    // wakes waiters for error statuses too.
+    command->setStatus((Coal::Event::Status)execution_status);
+
+    return CL_SUCCESS;
+}
diff --git a/src/api/api_flush.cpp b/src/api/api_flush.cpp
new file mode 100644
index 0000000..c0e93a7
--- /dev/null
+++ b/src/api/api_flush.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+/**
+ * \file api_flush.cpp
+ * \brief clFlush and clFinish
+ */
+
+#include "CL/cl.h"
+#include "core/commandqueue.h"
+
+// Flush and Finish APIs
+
+/**
+ * \brief Call flush() on the given command queue
+ * \return CL_SUCCESS, or CL_INVALID_COMMAND_QUEUE when the handle is not a
+ *         command queue object
+ */
+cl_int
+clFlush(cl_command_queue command_queue)
+{
+    const bool is_queue = command_queue->isA(Coal::Object::T_CommandQueue);
+
+    if (is_queue)
+    {
+        command_queue->flush();
+        return CL_SUCCESS;
+    }
+
+    return CL_INVALID_COMMAND_QUEUE;
+}
+
+/**
+ * \brief Call finish() on the given command queue
+ * \return CL_SUCCESS, or CL_INVALID_COMMAND_QUEUE when the handle is not a
+ *         command queue object
+ */
+cl_int
+clFinish(cl_command_queue command_queue)
+{
+    const bool is_queue = command_queue->isA(Coal::Object::T_CommandQueue);
+
+    if (is_queue)
+    {
+        command_queue->finish();
+        return CL_SUCCESS;
+    }
+
+    return CL_INVALID_COMMAND_QUEUE;
+}
diff --git a/src/api/api_gl.cpp b/src/api/api_gl.cpp
new file mode 100644
index 0000000..0f06499
--- /dev/null
+++ b/src/api/api_gl.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the copyright holder nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.
IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_gl.cpp
+ * \brief OpenGL bindings (unimplemented)
+ *
+ * None of these entry points is implemented. Instead of returning a NULL
+ * object with an untouched error code, or CL_SUCCESS for an operation that
+ * did nothing, every stub now reports an error: CL_INVALID_CONTEXT where a
+ * GL-sharing context would be required, CL_INVALID_GL_OBJECT where a
+ * GL-backed memory object would be required.
+ */
+
+#define GL_GLEXT_PROTOTYPES
+#include "GL/gl.h"
+#include "GL/glext.h"
+
+#include "CL/cl.h"
+#include "CL/cl_gl.h"
+
+/**
+ * \brief Unimplemented; always fails
+ * \param errcode_ret optional; receives CL_INVALID_CONTEXT when non-NULL
+ * \return 0
+ */
+cl_mem
+clCreateFromGLBuffer(cl_context   context,
+                     cl_mem_flags flags,
+                     GLuint       bufobj,
+                     int *        errcode_ret)
+{
+    if (errcode_ret)
+        *errcode_ret = CL_INVALID_CONTEXT;
+
+    return 0;
+}
+
+/**
+ * \brief Unimplemented; always fails
+ * \param errcode_ret optional; receives CL_INVALID_CONTEXT when non-NULL
+ * \return 0
+ */
+cl_mem
+clCreateFromGLTexture2D(cl_context   context,
+                        cl_mem_flags flags,
+                        GLenum       target,
+                        GLint        miplevel,
+                        GLuint       texture,
+                        int *        errcode_ret)
+{
+    if (errcode_ret)
+        *errcode_ret = CL_INVALID_CONTEXT;
+
+    return 0;
+}
+
+/**
+ * \brief Unimplemented; always fails
+ * \param errcode_ret optional; receives CL_INVALID_CONTEXT when non-NULL
+ * \return 0
+ */
+cl_mem
+clCreateFromGLTexture3D(cl_context   context,
+                        cl_mem_flags flags,
+                        GLenum       target,
+                        GLint        miplevel,
+                        GLuint       texture,
+                        int *        errcode_ret)
+{
+    if (errcode_ret)
+        *errcode_ret = CL_INVALID_CONTEXT;
+
+    return 0;
+}
+
+/**
+ * \brief Unimplemented; always fails
+ * \param errcode_ret optional; receives CL_INVALID_CONTEXT when non-NULL
+ * \return 0
+ */
+cl_mem
+clCreateFromGLRenderbuffer(cl_context   context,
+                           cl_mem_flags flags,
+                           GLuint       renderbuffer,
+                           int *        errcode_ret)
+{
+    if (errcode_ret)
+        *errcode_ret = CL_INVALID_CONTEXT;
+
+    return 0;
+}
+
+/**
+ * \brief Unimplemented; the output parameters are never written
+ * \return CL_INVALID_GL_OBJECT
+ */
+cl_int
+clGetGLObjectInfo(cl_mem              memobj,
+                  cl_gl_object_type * gl_object_type,
+                  GLuint *            gl_object_name)
+{
+    return CL_INVALID_GL_OBJECT;
+}
+
+/**
+ * \brief Unimplemented; the output parameters are never written
+ * \return CL_INVALID_GL_OBJECT
+ */
+cl_int
+clGetGLTextureInfo(cl_mem             memobj,
+                   cl_gl_texture_info param_name,
+                   size_t             param_value_size,
+                   void *             param_value,
+                   size_t *           param_value_size_ret)
+{
+    return CL_INVALID_GL_OBJECT;
+}
+
+/**
+ * \brief Unimplemented; nothing is enqueued and \p event is never written
+ * \return CL_INVALID_CONTEXT
+ */
+cl_int
+clEnqueueAcquireGLObjects(cl_command_queue command_queue,
+                          cl_uint          num_objects,
+                          const cl_mem *   mem_objects,
+                          cl_uint          num_events_in_wait_list,
+                          const cl_event * event_wait_list,
+                          cl_event *       event)
+{
+    return CL_INVALID_CONTEXT;
+}
+
+/**
+ * \brief Unimplemented; nothing is enqueued and \p event is never written
+ * \return CL_INVALID_CONTEXT
+ */
+cl_int
+clEnqueueReleaseGLObjects(cl_command_queue command_queue,
+                          cl_uint          num_objects,
+                          const cl_mem *   mem_objects,
+                          cl_uint          num_events_in_wait_list,
+                          const cl_event * event_wait_list,
+                          cl_event *       event)
+{
+    return CL_INVALID_CONTEXT;
+}
diff --git a/src/api/api_kernel.cpp b/src/api/api_kernel.cpp
new file mode 100644
index 0000000..abc492b
--- /dev/null
+++ b/src/api/api_kernel.cpp
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the copyright holder nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_kernel.cpp
+ * \brief Kernels
+ */
+
+#include "CL/cl.h"
+
+#include <core/program.h>
+#include <core/kernel.h>
+
+// Kernel Object APIs
+
+/**
+ * \brief Create the kernel named \p kernel_name from a built program
+ *
+ * \param errcode_ret optional; a local dummy is used when it is NULL
+ * \return the kernel, or 0 with CL_INVALID_VALUE, CL_INVALID_PROGRAM,
+ *         CL_INVALID_PROGRAM_EXECUTABLE or the program's own error code
+ *         stored through \p errcode_ret
+ */
+cl_kernel
+clCreateKernel(cl_program   program,
+               const char * kernel_name,
+               cl_int *     errcode_ret)
+{
+    cl_int dummy_errcode;
+
+    if (!errcode_ret)
+        errcode_ret = &dummy_errcode;
+
+    if (!kernel_name)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return 0;
+    }
+
+    if (!program->isA(Coal::Object::T_Program))
+    {
+        *errcode_ret = CL_INVALID_PROGRAM;
+        return 0;
+    }
+
+    if (program->state() != Coal::Program::Built)
+    {
+        *errcode_ret = CL_INVALID_PROGRAM_EXECUTABLE;
+        return 0;
+    }
+
+    //Coal::Kernel *kernel = program->createKernel(kernel_name, errcode_ret);
+    Coal::Kernel *kernel = program->createKernelsAndReturnKernel(kernel_name, errcode_ret);
+
+    if (*errcode_ret != CL_SUCCESS)
+    {
+        delete kernel;
+        return 0;
+    }
+
+    return (cl_kernel)kernel;
+}
+
+/**
+ * \brief Create every kernel of a built program
+ *
+ * \param num_kernels     capacity of \p kernels
+ * \param kernels         optional output array
+ * \param num_kernels_ret optional; receives the number of kernels created
+ * \return CL_SUCCESS, CL_INVALID_PROGRAM, CL_INVALID_PROGRAM_EXECUTABLE,
+ *         the error from createKernels(), or CL_INVALID_VALUE when
+ *         \p kernels is given but too small
+ */
+cl_int
+clCreateKernelsInProgram(cl_program  program,
+                         cl_uint     num_kernels,
+                         cl_kernel * kernels,
+                         cl_uint *   num_kernels_ret)
+{
+    cl_int rs = CL_SUCCESS;
+
+    if (!program->isA(Coal::Object::T_Program))
+        return CL_INVALID_PROGRAM;
+
+    if (program->state() != Coal::Program::Built)
+        return CL_INVALID_PROGRAM_EXECUTABLE;
+
+    std::vector<Coal::Kernel *> ks = program->createKernels(&rs);
+
+    if (rs != CL_SUCCESS)
+    {
+        while (ks.size())
+        {
+            delete ks.back();
+            ks.pop_back();
+        }
+
+        return rs;
+    }
+
+    // Check that the kernels will fit in the array, if needed
+    if (num_kernels_ret)
+        *num_kernels_ret = ks.size();
+
+    if (kernels && num_kernels < ks.size())
+    {
+        while (ks.size())
+        {
+            delete ks.back();
+            ks.pop_back();
+        }
+
+        return CL_INVALID_VALUE;
+    }
+
+    // When no output array is given the kernels are deliberately NOT
+    // deleted here (the original deletion loop was disabled).
+    // NOTE(review): presumably the program keeps track of them - confirm.
+    if (kernels)
+    {
+        // Copy the kernels
+        for (size_t i=0; i<ks.size(); ++i)
+        {
+            kernels[i] = (cl_kernel)ks[i];
+        }
+    }
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Take one more reference on a kernel
+ * \return CL_SUCCESS or CL_INVALID_KERNEL
+ */
+cl_int
+clRetainKernel(cl_kernel kernel)
+{
+    if (!kernel->isA(Coal::Object::T_Kernel))
+        return CL_INVALID_KERNEL;
+
+    kernel->reference();
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Drop one reference on a kernel
+ *
+ * When dereference() reports true, every entry of the parent program's
+ * kernelList whose p_name equals this kernel's is moved to
+ * kernelReleasedList. The kernel object itself is not deleted (see the BUG
+ * note below).
+ *
+ * \return CL_SUCCESS or CL_INVALID_KERNEL
+ */
+cl_int
+clReleaseKernel(cl_kernel kernel)
+{
+    if (!kernel->isA(Coal::Object::T_Kernel))
+        return CL_INVALID_KERNEL;
+
+    if (kernel->dereference())
+    {
+        Coal::Program *p =(Coal::Program *) kernel->parent();
+
+        size_t i = 0;
+
+        while (i < p->kernelList.size())
+        {
+            if (p->kernelList[i]->p_name.compare(kernel->p_name) == 0)
+            {
+                p->kernelReleasedList.push_back(p->kernelList[i]);
+                p->kernelList.erase(p->kernelList.begin() + i);
+                // erase() moved the following element into slot i, so the
+                // index must not advance here; the original for-loop did
+                // and skipped the entry after every erased one.
+
+                // BUG: TAG
+                // For some odd reason when we delete this, we're corrupting then inside of some kernel objects
+                //delete kernel;
+            }
+            else
+            {
+                ++i;
+            }
+        }
+    }
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Set a kernel argument; forwards to the kernel's setArg()
+ * \return CL_INVALID_KERNEL or whatever setArg() returns
+ */
+cl_int
+clSetKernelArg(cl_kernel    kernel,
+               cl_uint      arg_indx,
+               size_t       arg_size,
+               const void * arg_value)
+{
+    if (!kernel->isA(Coal::Object::T_Kernel))
+        return CL_INVALID_KERNEL;
+
+    return kernel->setArg(arg_indx, arg_size, arg_value);
+}
+
+/**
+ * \brief Query a kernel; forwards to the kernel's info()
+ * \return CL_INVALID_KERNEL or whatever info() returns
+ */
+cl_int
+clGetKernelInfo(cl_kernel      kernel,
+                cl_kernel_info param_name,
+                size_t         param_value_size,
+                void *         param_value,
+                size_t *       param_value_size_ret)
+{
+    if (!kernel->isA(Coal::Object::T_Kernel))
+        return CL_INVALID_KERNEL;
+
+    return kernel->info(param_name, param_value_size, param_value,
+                        param_value_size_ret);
+}
+
+/**
+ * \brief Query per-device work-group information; forwards to the kernel's
+ *        workGroupInfo()
+ * \return CL_INVALID_KERNEL or whatever workGroupInfo() returns
+ */
+cl_int
+clGetKernelWorkGroupInfo(cl_kernel                 kernel,
+                         cl_device_id              device,
+                         cl_kernel_work_group_info param_name,
+                         size_t                    param_value_size,
+                         void *                    param_value,
+                         size_t *                  param_value_size_ret)
+{
+    if (!kernel->isA(Coal::Object::T_Kernel))
+        return CL_INVALID_KERNEL;
+
+    return kernel->workGroupInfo((Coal::DeviceInterface *)device, param_name,
+                                 param_value_size, param_value,
+                                 param_value_size_ret);
+}
diff --git a/src/api/api_memory.cpp b/src/api/api_memory.cpp
new file mode 100644
index 0000000..18e6bab
--- /dev/null
+++ b/src/api/api_memory.cpp
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights
reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+/**
+ * \file api_memory.cpp
+ * \brief Memory objects
+ */
+
+#include "CL/cl.h"
+#include <core/memobject.h>
+#include <core/context.h>
+
+#include <cstring>
+
+// Memory Object APIs
+
+/**
+ * \brief Create and initialise a buffer object
+ *
+ * \param errcode_ret optional; a local dummy is used when it is NULL
+ * \return the buffer, or 0 with CL_INVALID_CONTEXT, the Coal::Buffer
+ *         constructor's error or the init() error stored through
+ *         \p errcode_ret
+ */
+cl_mem
+clCreateBuffer(cl_context   context,
+               cl_mem_flags flags,
+               size_t       size,
+               void *       host_ptr,
+               cl_int *     errcode_ret)
+{
+    cl_int dummy_errcode;
+
+    if (!errcode_ret)
+        errcode_ret = &dummy_errcode;
+
+    if (!context->isA(Coal::Object::T_Context))
+    {
+        *errcode_ret = CL_INVALID_CONTEXT;
+        return 0;
+    }
+
+    *errcode_ret = CL_SUCCESS;
+
+    Coal::Buffer *buf = new Coal::Buffer(context, size, host_ptr, flags,
+                                         errcode_ret);
+
+    // init() only runs when the constructor reported success
+    if (*errcode_ret != CL_SUCCESS || (*errcode_ret = buf->init()) != CL_SUCCESS)
+    {
+        delete buf;
+        return 0;
+    }
+
+    return (cl_mem)buf;
+}
+
+/**
+ * \brief Create a sub-buffer covering a region of an existing buffer
+ *
+ * Only CL_BUFFER_CREATE_TYPE_REGION is accepted, with
+ * \p buffer_create_info pointing to a cl_buffer_region.
+ *
+ * \param errcode_ret optional; a local dummy is used when it is NULL
+ * \return the sub-buffer, or 0 with CL_INVALID_MEM_OBJECT,
+ *         CL_INVALID_VALUE, the constructor's error or the init() error
+ *         stored through \p errcode_ret
+ */
+cl_mem
+clCreateSubBuffer(cl_mem                buffer,
+                  cl_mem_flags          flags,
+                  cl_buffer_create_type buffer_create_type,
+                  const void *          buffer_create_info,
+                  cl_int *              errcode_ret)
+{
+    cl_int dummy_errcode;
+
+    if (!errcode_ret)
+        errcode_ret = &dummy_errcode;
+
+    if (!buffer->isA(Coal::Object::T_MemObject))
+    {
+        *errcode_ret = CL_INVALID_MEM_OBJECT;
+        return 0;
+    }
+
+    Coal::MemObject *memobject = (Coal::MemObject *)buffer;
+    cl_buffer_region *region = (cl_buffer_region *)buffer_create_info;
+
+    // NOTE: Is it right ? Couldn't we create SubBuffers of images ?
+    if (memobject->type() != Coal::MemObject::Buffer)
+    {
+        *errcode_ret = CL_INVALID_MEM_OBJECT;
+        return 0;
+    }
+
+    if (buffer_create_type != CL_BUFFER_CREATE_TYPE_REGION)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return 0;
+    }
+
+    // region is only dereferenced after this NULL check
+    if (!buffer_create_info)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return 0;
+    }
+
+    *errcode_ret = CL_SUCCESS;
+
+    Coal::SubBuffer *buf = new Coal::SubBuffer((Coal::Buffer *)buffer,
+                                               region->origin, region->size,
+                                               flags, errcode_ret);
+
+    if (*errcode_ret != CL_SUCCESS || (*errcode_ret = buf->init()) != CL_SUCCESS)
+    {
+        delete buf;
+        return 0;
+    }
+
+    return (cl_mem)buf;
+}
+
+/**
+ * \brief Create and initialise a 2D image
+ *
+ * \param errcode_ret optional; a local dummy is used when it is NULL
+ * \return the image, or 0 with CL_INVALID_CONTEXT, the Coal::Image2D
+ *         constructor's error or the init() error stored through
+ *         \p errcode_ret
+ */
+cl_mem
+clCreateImage2D(cl_context              context,
+                cl_mem_flags            flags,
+                const cl_image_format * image_format,
+                size_t                  image_width,
+                size_t                  image_height,
+                size_t                  image_row_pitch,
+                void *                  host_ptr,
+                cl_int *                errcode_ret)
+{
+    cl_int dummy_errcode;
+
+    if (!errcode_ret)
+        errcode_ret = &dummy_errcode;
+
+    if (!context->isA(Coal::Object::T_Context))
+    {
+        *errcode_ret = CL_INVALID_CONTEXT;
+        return 0;
+    }
+
+    *errcode_ret = CL_SUCCESS;
+
+    Coal::Image2D *image = new Coal::Image2D(context, image_width, image_height,
+                                             image_row_pitch, image_format,
+                                             host_ptr, flags, errcode_ret);
+
+    if (*errcode_ret != CL_SUCCESS || (*errcode_ret = image->init()) != CL_SUCCESS)
+    {
+        delete image;
+        return 0;
+    }
+
+    return (cl_mem)image;
+}
+
+/**
+ * \brief Create and initialise a 3D image
+ *
+ * \param errcode_ret optional; a local dummy is used when it is NULL
+ * \return the image, or 0 with CL_INVALID_CONTEXT, the Coal::Image3D
+ *         constructor's error or the init() error stored through
+ *         \p errcode_ret
+ */
+cl_mem
+clCreateImage3D(cl_context              context,
+                cl_mem_flags            flags,
+                const cl_image_format * image_format,
+                size_t                  image_width,
+                size_t                  image_height,
+                size_t                  image_depth,
+                size_t                  image_row_pitch,
+                size_t                  image_slice_pitch,
+                void *                  host_ptr,
+                cl_int *                errcode_ret)
+{
+    cl_int dummy_errcode;
+
+    if (!errcode_ret)
+        errcode_ret = &dummy_errcode;
+
+    if (!context->isA(Coal::Object::T_Context))
+    {
+        *errcode_ret = CL_INVALID_CONTEXT;
+        return 0;
+    }
+
+    *errcode_ret = CL_SUCCESS;
+
+    Coal::Image3D *image = new Coal::Image3D(context, image_width, image_height,
+                                             image_depth, image_row_pitch,
+                                             image_slice_pitch, image_format,
+                                             host_ptr, flags, errcode_ret);
+
+    if (*errcode_ret != CL_SUCCESS || (*errcode_ret = image->init()) != CL_SUCCESS)
+    {
+        delete image;
+        return 0;
+    }
+
+    return (cl_mem)image;
+}
+
+/**
+ * \brief Take one more reference on a memory object
+ * \return CL_SUCCESS or CL_INVALID_MEM_OBJECT
+ */
+cl_int
+clRetainMemObject(cl_mem memobj)
+{
+    if (!memobj->isA(Coal::Object::T_MemObject))
+        return CL_INVALID_MEM_OBJECT;
+
+    memobj->reference();
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Drop one reference; the object is deleted when dereference()
+ *        reports true
+ * \return CL_SUCCESS or CL_INVALID_MEM_OBJECT
+ */
+cl_int
+clReleaseMemObject(cl_mem memobj)
+{
+    if (!memobj->isA(Coal::Object::T_MemObject))
+        return CL_INVALID_MEM_OBJECT;
+
+    if (memobj->dereference())
+        delete memobj;
+
+    return CL_SUCCESS;
+}
+
+// Channel order / channel data type pairs reported by
+// clGetSupportedImageFormats
+static cl_image_format supported_formats[] = {
+    { CL_RGBA, CL_UNORM_INT8 },
+    { CL_RGBA, CL_UNORM_INT16 },
+    { CL_RGBA, CL_SNORM_INT8 },
+    { CL_RGBA, CL_SNORM_INT16 },
+    { CL_RGBA, CL_SIGNED_INT8 },
+    { CL_RGBA, CL_SIGNED_INT16 },
+    { CL_RGBA, CL_SIGNED_INT32 },
+    { CL_RGBA, CL_UNSIGNED_INT8 },
+    { CL_RGBA, CL_UNSIGNED_INT16 },
+    { CL_RGBA, CL_UNSIGNED_INT32 },
+    { CL_RGBA, CL_FLOAT },
+
+    { CL_ARGB, CL_UNORM_INT8 },
+    { CL_ARGB, CL_SNORM_INT8 },
+    { CL_ARGB, CL_SIGNED_INT8 },
+    { CL_ARGB, CL_UNSIGNED_INT8 },
+
+    { CL_BGRA, CL_UNORM_INT8 },
+    { CL_BGRA, CL_SNORM_INT8 },
+    { CL_BGRA, CL_SIGNED_INT8 },
+    { CL_BGRA, CL_UNSIGNED_INT8 },
+
+    { CL_RGB, CL_UNORM_SHORT_565 },
+    { CL_RGB, CL_UNORM_SHORT_555 },
+    { CL_RGB, CL_UNORM_INT_101010 },
+
+    { CL_RGBx, CL_UNORM_SHORT_565 },
+    { CL_RGBx, CL_UNORM_SHORT_555 },
+    { CL_RGBx, CL_UNORM_INT_101010 },
+
+    { CL_RG, CL_UNORM_INT8 },
+    { CL_RG, CL_UNORM_INT16 },
+    { CL_RG, CL_SNORM_INT8 },
+    { CL_RG, CL_SNORM_INT16 },
+    { CL_RG, CL_SIGNED_INT8 },
+    { CL_RG, CL_SIGNED_INT16 },
+    { CL_RG, CL_SIGNED_INT32 },
+    { CL_RG, CL_UNSIGNED_INT8 },
+    { CL_RG, CL_UNSIGNED_INT16 },
+    { CL_RG, CL_UNSIGNED_INT32 },
+    { CL_RG, CL_FLOAT },
+
+    { CL_RGx, CL_UNORM_INT8 },
+    { CL_RGx, CL_UNORM_INT16 },
+    { CL_RGx, CL_SNORM_INT8 },
+    { CL_RGx, CL_SNORM_INT16 },
+    { CL_RGx, CL_SIGNED_INT8 },
+    { CL_RGx, CL_SIGNED_INT16 },
+    { CL_RGx, CL_SIGNED_INT32 },
+    { CL_RGx, CL_UNSIGNED_INT8 },
+    { CL_RGx, CL_UNSIGNED_INT16 },
+    { CL_RGx, CL_UNSIGNED_INT32 },
+    { CL_RGx, CL_FLOAT },
+
+    { CL_RA, CL_UNORM_INT8 },
+    { CL_RA, CL_UNORM_INT16 },
+    { CL_RA, CL_SNORM_INT8 },
+    { CL_RA, CL_SNORM_INT16 },
+    { CL_RA, CL_SIGNED_INT8 },
+    { CL_RA, CL_SIGNED_INT16 },
+    { CL_RA, CL_SIGNED_INT32 },
+    { CL_RA, CL_UNSIGNED_INT8 },
+    { CL_RA, CL_UNSIGNED_INT16 },
+    { CL_RA, CL_UNSIGNED_INT32 },
+    { CL_RA, CL_FLOAT },
+
+    { CL_R, CL_UNORM_INT8 },
+    { CL_R, CL_UNORM_INT16 },
+    { CL_R, CL_SNORM_INT8 },
+    { CL_R, CL_SNORM_INT16 },
+    { CL_R, CL_SIGNED_INT8 },
+    { CL_R, CL_SIGNED_INT16 },
+    { CL_R, CL_SIGNED_INT32 },
+    { CL_R, CL_UNSIGNED_INT8 },
+    { CL_R, CL_UNSIGNED_INT16 },
+    { CL_R, CL_UNSIGNED_INT32 },
+    { CL_R, CL_FLOAT },
+
+    { CL_Rx, CL_UNORM_INT8 },
+    { CL_Rx, CL_UNORM_INT16 },
+    { CL_Rx, CL_SNORM_INT8 },
+    { CL_Rx, CL_SNORM_INT16 },
+    { CL_Rx, CL_SIGNED_INT8 },
+    { CL_Rx, CL_SIGNED_INT16 },
+    { CL_Rx, CL_SIGNED_INT32 },
+    { CL_Rx, CL_UNSIGNED_INT8 },
+    { CL_Rx, CL_UNSIGNED_INT16 },
+    { CL_Rx, CL_UNSIGNED_INT32 },
+    { CL_Rx, CL_FLOAT },
+
+    { CL_A, CL_UNORM_INT8 },
+    { CL_A, CL_UNORM_INT16 },
+    { CL_A, CL_SNORM_INT8 },
+    { CL_A, CL_SNORM_INT16 },
+    { CL_A, CL_SIGNED_INT8 },
+    { CL_A, CL_SIGNED_INT16 },
+    { CL_A, CL_SIGNED_INT32 },
+    { CL_A, CL_UNSIGNED_INT8 },
+    { CL_A, CL_UNSIGNED_INT16 },
+    { CL_A, CL_UNSIGNED_INT32 },
+    { CL_A, CL_FLOAT },
+
+    { CL_LUMINANCE, CL_UNORM_INT8 },
+    { CL_LUMINANCE, CL_UNORM_INT16 },
+    { CL_LUMINANCE, CL_SNORM_INT8 },
+    { CL_LUMINANCE, CL_SNORM_INT16 },
+    { CL_LUMINANCE, CL_FLOAT },
+
+    { CL_INTENSITY, CL_UNORM_INT8 },
+    { CL_INTENSITY, CL_UNORM_INT16 },
+    { CL_INTENSITY, CL_SNORM_INT8 },
+    { CL_INTENSITY, CL_SNORM_INT16 },
+    { CL_INTENSITY, CL_FLOAT }
+};
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+/**
+ * \brief Report the image formats listed in supported_formats
+ *
+ * \p flags and \p image_type are ignored: the same table is returned for
+ * every combination. At most \p num_entries entries are copied.
+ *
+ * \param image_formats     optional output array
+ * \param num_image_formats optional; receives the total number of formats
+ * \return CL_SUCCESS, CL_INVALID_CONTEXT, or CL_INVALID_VALUE when an
+ *         output array is given with \p num_entries == 0
+ */
+cl_int
+clGetSupportedImageFormats(cl_context         context,
+                           cl_mem_flags       flags,
+                           cl_mem_object_type image_type,
+                           cl_uint            num_entries,
+                           cl_image_format *  image_formats,
+                           cl_uint *          num_image_formats)
+{
+    if (!context->isA(Coal::Object::T_Context))
+        return CL_INVALID_CONTEXT;
+
+    (void) flags;
+    (void) image_type;
+
+    if (!num_entries && image_formats)
+        return CL_INVALID_VALUE;
+
+    if (image_formats)
+    {
+        // Never copy more than the caller's array or the table can hold
+        std::memcpy(image_formats, supported_formats,
+                    MIN(num_entries * sizeof(cl_image_format),
+                        sizeof(supported_formats)));
+    }
+
+    if (num_image_formats)
+        *num_image_formats = sizeof(supported_formats) / sizeof(cl_image_format);
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Query a memory object; forwards to the object's info()
+ * \return CL_INVALID_MEM_OBJECT or whatever info() returns
+ */
+cl_int
+clGetMemObjectInfo(cl_mem      memobj,
+                   cl_mem_info param_name,
+                   size_t      param_value_size,
+                   void *      param_value,
+                   size_t *    param_value_size_ret)
+{
+    if (!memobj->isA(Coal::Object::T_MemObject))
+        return CL_INVALID_MEM_OBJECT;
+
+    return memobj->info(param_name, param_value_size, param_value,
+                        param_value_size_ret);
+}
+
+/**
+ * \brief Query an image; forwards to Coal::Image2D::imageInfo()
+ *
+ * Image3D objects are handled through the same Image2D cast.
+ *
+ * \return CL_INVALID_MEM_OBJECT (not a memory object, or neither Image2D
+ *         nor Image3D) or whatever imageInfo() returns
+ */
+cl_int
+clGetImageInfo(cl_mem        image,
+               cl_image_info param_name,
+               size_t        param_value_size,
+               void *        param_value,
+               size_t *      param_value_size_ret)
+{
+    if (!image->isA(Coal::Object::T_MemObject) ||
+        (image->type() != Coal::MemObject::Image2D &&
+         image->type() != Coal::MemObject::Image3D))
+        return CL_INVALID_MEM_OBJECT;
+
+    Coal::Image2D *image2d = (Coal::Image2D *)image;
+
+    return image2d->imageInfo(param_name, param_value_size, param_value,
+                              param_value_size_ret);
+}
+
+/**
+ * \brief Register a destructor callback on a memory object
+ *
+ * A NULL \p pfn_notify is rejected with CL_INVALID_VALUE, as the OpenCL
+ * specification requires, instead of being stored.
+ *
+ * \return CL_SUCCESS, CL_INVALID_MEM_OBJECT or CL_INVALID_VALUE
+ */
+cl_int
+clSetMemObjectDestructorCallback(cl_mem memobj,
+                                 void (CL_CALLBACK *pfn_notify)(cl_mem memobj,
+                                                                void *user_data),
+                                 void * user_data)
+{
+    if (!memobj->isA(Coal::Object::T_MemObject))
+        return CL_INVALID_MEM_OBJECT;
+
+    if (!pfn_notify)
+        return CL_INVALID_VALUE;
+
+    memobj->setDestructorCallback(pfn_notify, user_data);
+
+    return CL_SUCCESS;
+}
+
diff --git a/src/api/api_platform.cpp b/src/api/api_platform.cpp
new file mode 100644
index 0000000..cf064ef
--- /dev/null
+++ b/src/api/api_platform.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c)
2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** + * \file api_platform.cpp + * \brief Platform + */ + +#include "CL/cl.h" +#include "CL/cl_ext.h" +#include <core/platform.h> +#include <core/config.h> +#include <cstring> + +// Platform API + +cl_int CL_API_CALL +clGetPlatformIDs(cl_uint num_entries, + cl_platform_id * platforms, + cl_uint * num_platforms) +{ + if (num_platforms) *num_platforms = 1; + else if (!platforms) return CL_INVALID_VALUE; + + if (!num_entries && platforms) return CL_INVALID_VALUE; + + /*------------------------------------------------------------------------- + * Only one "default" platform + *------------------------------------------------------------------------*/ + if (platforms != 0) *platforms = &the_platform; + + return CL_SUCCESS; +} + +cl_int CL_API_CALL +clGetPlatformInfo(cl_platform_id platform, + cl_platform_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) +{ + const char *string = 0; + unsigned long len = 0; + + /*------------------------------------------------------------------------- + * NULL or what is returned by clGetPlatformIDs, that's to say also NULL + *------------------------------------------------------------------------*/ + if (platform != &the_platform) return CL_INVALID_PLATFORM; + + return platform->info(param_name, param_value_size, param_value, + param_value_size_ret); +} + +/****************************************************************************** +* Return a pointer to any supported extension functions +******************************************************************************/ +void * clGetExtensionFunctionAddress(const char *funcname) +{ + if (strcmp(funcname, "clIcdGetPlatformIDsKHR") == 0) + return (void*)clGetPlatformIDs; + + return NULL; +} + diff --git a/src/api/api_profiling.cpp b/src/api/api_profiling.cpp new file mode 100644 index 0000000..0abec66 --- /dev/null +++ b/src/api/api_profiling.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher 
<steckdenis@yahoo.fr> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** + * \file api_profiling.cpp + * \brief Profiling of events + */ + +#include "CL/cl.h" +#include <core/commandqueue.h> + +// Profiling APIs +cl_int +clGetEventProfilingInfo(cl_event event, + cl_profiling_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) +{ + if (!event->isA(Coal::Object::T_Event)) + return CL_INVALID_EVENT; + + return event->profilingInfo(param_name, param_value_size, param_value, + param_value_size_ret); +} + diff --git a/src/api/api_program.cpp b/src/api/api_program.cpp new file mode 100644 index 0000000..af30510 --- /dev/null +++ b/src/api/api_program.cpp @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file api_program.cpp + * \brief Programs + */ + +#include "CL/cl.h" +#include <core/program.h> +#include <core/context.h> + +#include <cstdlib> + +// Program Object APIs +cl_program +clCreateProgramWithSource(cl_context context, + cl_uint count, + const char ** strings, + const size_t * lengths, + cl_int * errcode_ret) +{ + cl_int dummy_errcode; + + if (!errcode_ret) + errcode_ret = &dummy_errcode; + + if (!context->isA(Coal::Object::T_Context)) + { + *errcode_ret = CL_INVALID_CONTEXT; + return 0; + } + + if (!count || !strings) + { + *errcode_ret = CL_INVALID_VALUE; + return 0; + } + + Coal::Program *program = new Coal::Program(context); + + *errcode_ret = CL_SUCCESS; + *errcode_ret = program->loadSources(count, strings, lengths); + + if (*errcode_ret != CL_SUCCESS) + { + delete program; + return 0; + } + + return (cl_program)program; +} + +cl_program +clCreateProgramWithBinary(cl_context context, + cl_uint num_devices, + const cl_device_id * device_list, + const size_t * lengths, + const unsigned char **binaries, + cl_int * binary_status, + cl_int * errcode_ret) +{ + cl_int dummy_errcode; + + if (!errcode_ret) + errcode_ret = &dummy_errcode; + + if (!context->isA(Coal::Object::T_Context)) + { + *errcode_ret = CL_INVALID_CONTEXT; + return 0; + } + + if (!num_devices || !device_list || !lengths || !binaries) + { + *errcode_ret = CL_INVALID_VALUE; + return 0; + } + + // Check the devices for compliance + cl_uint context_num_devices = 0; + 
cl_device_id *context_devices; + + *errcode_ret = context->info(CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint), + &context_num_devices, 0); + + if (*errcode_ret != CL_SUCCESS) + return 0; + + // Temporary copy of the context's device list, used only to validate + // device_list below; it must be released on every exit path. + context_devices = + (cl_device_id *)std::malloc(context_num_devices * sizeof(cl_device_id)); + + if (!context_devices) + { + *errcode_ret = CL_OUT_OF_HOST_MEMORY; + return 0; + } + + *errcode_ret = context->info(CL_CONTEXT_DEVICES, + context_num_devices * sizeof(cl_device_id), + context_devices, 0); + + if (*errcode_ret != CL_SUCCESS) + { + std::free(context_devices); + return 0; + } + + for (cl_uint i=0; i<num_devices; ++i) + { + bool found = false; + + if (!lengths[i] || !binaries[i]) + { + if (binary_status) + binary_status[i] = CL_INVALID_VALUE; + + std::free(context_devices); + *errcode_ret = CL_INVALID_VALUE; + return 0; + } + + for (cl_uint j=0; j<context_num_devices; ++j) + { + if (device_list[i] == context_devices[j]) + { + found = true; + break; + } + } + + if (!found) + { + std::free(context_devices); + *errcode_ret = CL_INVALID_DEVICE; + return 0; + } + } + + // Validation is done; loadBinaries() below receives the caller's + // device_list, not this temporary copy. + std::free(context_devices); + + // Create a program + Coal::Program *program = new Coal::Program(context); + *errcode_ret = CL_SUCCESS; + + // Init program + *errcode_ret = program->loadBinaries(binaries, + lengths, binary_status, num_devices, + (Coal::DeviceInterface * const*)device_list); + + if (*errcode_ret != CL_SUCCESS) + { + delete program; + return 0; + } + + return (cl_program)program; +} + +cl_int +clRetainProgram(cl_program program) +{ + if (!program->isA(Coal::Object::T_Program)) + return CL_INVALID_PROGRAM; + + program->reference(); + + return CL_SUCCESS; +} + +cl_int +clReleaseProgram(cl_program program) +{ + if (!program->isA(Coal::Object::T_Program)) + return CL_INVALID_PROGRAM; + + if (program->dereference()) + delete program; + + return CL_SUCCESS; +} + +cl_int +clBuildProgram(cl_program program, + cl_uint num_devices, + const cl_device_id * device_list, + const char * options, + void (*pfn_notify)(cl_program program, void * user_data), + void * user_data) +{ + if (!program->isA(Coal::Object::T_Program)) + return CL_INVALID_PROGRAM; + + if (!device_list && num_devices > 0) + return CL_INVALID_VALUE; + + if 
(!num_devices && device_list) + return CL_INVALID_VALUE; + + if (!pfn_notify && user_data) + return CL_INVALID_VALUE; + + cl_uint context_num_devices = 0; + cl_device_id *context_devices; + Coal::Context *context = (Coal::Context *)program->parent(); + cl_int result; + + result = context->info(CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint), + &context_num_devices, 0); + + if (result != CL_SUCCESS) return result; + + // Temporary copy of the context's device list; used to validate + // device_list, or as the device list itself when the caller passed none. + // It must be released on every exit path. + context_devices = + (cl_device_id *)std::malloc(context_num_devices * sizeof(cl_device_id)); + + if (!context_devices) return CL_OUT_OF_HOST_MEMORY; + + result = context->info(CL_CONTEXT_DEVICES, + context_num_devices * sizeof(cl_device_id), + context_devices, 0); + + if (result != CL_SUCCESS) + { + std::free(context_devices); + return result; + } + + + // Check the devices for compliance + if (num_devices) + { + for (cl_uint i=0; i<num_devices; ++i) + { + bool found = false; + + for (cl_uint j=0; j<context_num_devices; ++j) + { + if (device_list[i] == context_devices[j]) + { + found = true; + break; + } + } + + if (!found) + { + std::free(context_devices); + return CL_INVALID_DEVICE; + } + } + } + else + { + num_devices = context_num_devices; + device_list = context_devices; + } + + // We cannot try to build a previously-failed program + if (!(program->state() == Coal::Program::Loaded || + program->state() == Coal::Program::Built )) + { + std::free(context_devices); + return CL_INVALID_OPERATION; + } + + // Build program + result = program->build(options, pfn_notify, user_data, num_devices, + (Coal::DeviceInterface * const*)device_list); + + // NOTE(review): freeing here assumes Program::build() copies the device + // pointers it needs and does not keep device_list after it returns -- + // confirm against core/program.cpp. + std::free(context_devices); + + return result; +} + +cl_int +clUnloadCompiler(void) +{ + return CL_SUCCESS; +} + +cl_int +clGetProgramInfo(cl_program program, + cl_program_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) +{ + if (!program->isA(Coal::Object::T_Program)) + return CL_INVALID_PROGRAM; + + return program->info(param_name, param_value_size, param_value, + param_value_size_ret); +} + +cl_int +clGetProgramBuildInfo(cl_program program, + cl_device_id device, + cl_program_build_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) +{ + if 
(!program->isA(Coal::Object::T_Program)) + return CL_INVALID_PROGRAM; + + if (!device) + return CL_INVALID_DEVICE; + + return program->buildInfo((Coal::DeviceInterface *)device, param_name, + param_value_size, param_value, + param_value_size_ret); +} diff --git a/src/api/api_sampler.cpp b/src/api/api_sampler.cpp new file mode 100644 index 0000000..9bd2dec --- /dev/null +++ b/src/api/api_sampler.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file api_sampler.cpp + * \brief Samplers + */ + +#include "CL/cl.h" + +#include "core/sampler.h" +#include "core/context.h" + +// Sampler APIs +cl_sampler +clCreateSampler(cl_context context, + cl_bool normalized_coords, + cl_addressing_mode addressing_mode, + cl_filter_mode filter_mode, + cl_int * errcode_ret) +{ + cl_int dummy_errcode; + + if (!errcode_ret) + errcode_ret = &dummy_errcode; + + if (!context->isA(Coal::Object::T_Context)) + { + *errcode_ret = CL_INVALID_CONTEXT; + return 0; + } + + *errcode_ret = CL_SUCCESS; + + Coal::Sampler *sampler = new Coal::Sampler((Coal::Context *)context, + normalized_coords, + addressing_mode, + filter_mode, + errcode_ret); + + if (*errcode_ret != CL_SUCCESS) + { + delete sampler; + return 0; + } + + return (cl_sampler)sampler; +} + +cl_int +clRetainSampler(cl_sampler sampler) +{ + if (!sampler->isA(Coal::Object::T_Sampler)) + return CL_INVALID_SAMPLER; + + sampler->reference(); + + return CL_SUCCESS; +} + +cl_int +clReleaseSampler(cl_sampler sampler) +{ + if (!sampler->isA(Coal::Object::T_Sampler)) + return CL_INVALID_SAMPLER; + + if (sampler->dereference()) + delete sampler; + + return CL_SUCCESS; +} + +cl_int +clGetSamplerInfo(cl_sampler sampler, + cl_sampler_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) +{ + if (!sampler->isA(Coal::Object::T_Sampler)) + return CL_INVALID_SAMPLER; + + return sampler->info(param_name, param_value_size, 
param_value, + param_value_size_ret); +} diff --git a/src/builtins/CMakeLists.txt b/src/builtins/CMakeLists.txt new file mode 100644 index 0000000..a83dfdf --- /dev/null +++ b/src/builtins/CMakeLists.txt @@ -0,0 +1,33 @@ +if (SHAMROCK_BUILD) + +# Compile every OpenCL C builtin source to LLVM bitcode, then link all of the +# bitcode modules into builtins.lib. +# NOTE(review): "-Fvisibility=protected" is spelled with a capital F, which is +# clang's framework-search-path option; it looks like an -fvisibility setting +# was intended -- confirm against the clang version in use before changing it. +set(CUSTOM_COMMAND ${CLANG_EXECUTABLE} -cc1 -emit-llvm-bc -x cl -O2 -fno-builtin -nobuiltininc -Fvisibility=protected -cl-std=CL1.2 -ffp-contract=off ) + +# NOTE(review): sources are globbed from CLC_BUILTINS_DIR but the include path +# below uses OCL_BUILTINS_DIR -- confirm both are defined by the parent scope. +file(GLOB CL_SOURCES "${CLC_BUILTINS_DIR}/*.cl") +#MESSAGE(STATUS "CL_SOURCES: ${CL_SOURCES}" ) + +# One bitcode file per .cl source, written to the binary directory. +set(BC_SOURCES) +foreach(f ${CL_SOURCES}) + get_filename_component(fn "${f}" NAME_WE) + #MESSAGE(STATUS "CL_SOURCE: ${f}" ) + set(bc "${CMAKE_CURRENT_BINARY_DIR}/${fn}.bc") + add_custom_command(OUTPUT "${bc}" + COMMAND ${CUSTOM_COMMAND} + "-I${OCL_BUILTINS_DIR}/include" + -o "${bc}" "${f}" + DEPENDS "${f}" + COMMENT "Generating ${bc}" + VERBATIM) + list(APPEND BC_SOURCES "${bc}") +endforeach() +#MESSAGE( STATUS "BC_SOURCES: ${BC_SOURCES}") + +add_custom_target(generate_bc_files DEPENDS ${BC_SOURCES}) + +add_custom_command(OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/builtins.lib" + COMMAND llvm-link + -o "${CMAKE_CURRENT_BINARY_DIR}/builtins.lib" ${BC_SOURCES} + DEPENDS ${BC_SOURCES} + COMMENT "Linking builtins.lib" + VERBATIM) + +add_custom_target(generate_builtin_lib DEPENDS + "${CMAKE_CURRENT_BINARY_DIR}/builtins.lib") + +# Both targets reference the .bc outputs; order them explicitly so a parallel +# build never runs the same bitcode rule from two independent targets at once. +add_dependencies(generate_builtin_lib generate_bc_files) + +endif() diff --git a/src/builtins/Makefile b/src/builtins/Makefile new file mode 100644 index 0000000..1d3349b --- /dev/null +++ b/src/builtins/Makefile @@ -0,0 +1,24 @@ +CLANG = clang +CLANG_CFLAGS = -cc1 -emit-llvm-bc -x cl -O2 -fno-builtin -nobuiltininc +CLANG_CFLAGS += -Fvisibility=protected -cl-std=CL1.2 -ffp-contract=off +CLANG_CFLAGS += -I../../include + +CL_FILES = $(wildcard *.cl) +BYTECODE := ${CL_FILES:.cl=.bc} + +all: builtins.lib + +builtins.lib: $(BYTECODE) + @echo $@ Linking bytecode modules + llvm-link -o $@ $^ + +%.bc: %.cl + @echo $< Parsing + @$(CLANG) $(CLANG_CFLAGS) $< -o $@ + +%.ll: %.bc + @echo $< Disassembling + llvm-dis $< + +clean: + @rm -rf *.bc *.ll diff --git a/src/builtins/README.txt 
b/src/builtins/README.txt new file mode 100644 index 0000000..5e16118 --- /dev/null +++ b/src/builtins/README.txt @@ -0,0 +1,13 @@ +This directory (builtins) is intended to supersede src/runtime as a means +to provide a builtins library for OpenCL kernels. + +Note: some of the files here do not compile due to an address space casting +error, and are suffixed *.cl.broken. + +Files here were imported from the TI opencl_builtins private repository and +repurposed for CPU device (from DSP device). + +This library appears to have been adapted from libclc.llvm.org. + +The Makefile here is not used, but is available for illustration purposes and +to allow disassembly of the bc files for inspection. diff --git a/src/builtins/abs.cl b/src/builtins/abs.cl new file mode 100644 index 0000000..71dcf75 --- /dev/null +++ b/src/builtins/abs.cl @@ -0,0 +1,33 @@ +/****************************************************************************** + * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "cpu.h" + +UNARY_VEC_DEF(char, uchar, abs, abs) +UNARY_VEC_DEF(short, ushort, abs, abs) +UNARY_VEC_DEF(int, uint, abs, abs) +UNARY_VEC_DEF(long, ulong, abs, abs) diff --git a/src/builtins/abs_diff.cl b/src/builtins/abs_diff.cl new file mode 100644 index 0000000..ecc8e37 --- /dev/null +++ b/src/builtins/abs_diff.cl @@ -0,0 +1,72 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "cpu.h" + +#define EXPAND_SIZES(type, utype) \ + TEMPLATE(_VEC_TYPE(type,2), _VEC_TYPE(utype,2)) \ + TEMPLATE(_VEC_TYPE(type,3), _VEC_TYPE(utype,3)) \ + TEMPLATE(_VEC_TYPE(type,4), _VEC_TYPE(utype,4)) \ + TEMPLATE(_VEC_TYPE(type,8), _VEC_TYPE(utype,8)) \ + TEMPLATE(_VEC_TYPE(type,16), _VEC_TYPE(utype,16)) \ + +#define TEMPLATE(gentype, ugentype) \ + _CLC_OVERLOAD _CLC_DEF ugentype abs_diff(gentype x, gentype y) \ + { return __builtin_astype(x > y ? x-y : y-x, ugentype); } + +EXPAND_SIZES(uchar, uchar) +EXPAND_SIZES(char, uchar) +EXPAND_SIZES(ushort, ushort) +EXPAND_SIZES(short, ushort) +EXPAND_SIZES(uint, uint) +EXPAND_SIZES(ulong, ulong) + +#undef TEMPLATE + +#define TEMPLATE(gentype, ugentype, shiftval) \ +_CLC_OVERLOAD _CLC_DEF ugentype abs_diff(gentype x, gentype y) \ +{ \ + gentype signs_differ = (x^y) >> (gentype)shiftval; \ + return (signs_differ) ? 
abs(x) + abs(y) : \ + __builtin_astype(x > y ? x-y : y-x, ugentype); \ +} + +TEMPLATE(int, uint, 31) +TEMPLATE(_VEC_TYPE(int,2), _VEC_TYPE(uint,2), 31) +TEMPLATE(_VEC_TYPE(int,3), _VEC_TYPE(uint,3), 31) +TEMPLATE(_VEC_TYPE(int,4), _VEC_TYPE(uint,4), 31) +TEMPLATE(_VEC_TYPE(int,8), _VEC_TYPE(uint,8), 31) +TEMPLATE(_VEC_TYPE(int,16), _VEC_TYPE(uint,16), 31) + +TEMPLATE(long, ulong, 63) +TEMPLATE(_VEC_TYPE(long,2), _VEC_TYPE(ulong,2), 63) +TEMPLATE(_VEC_TYPE(long,3), _VEC_TYPE(ulong,3), 63) +TEMPLATE(_VEC_TYPE(long,4), _VEC_TYPE(ulong,4), 63) +TEMPLATE(_VEC_TYPE(long,8), _VEC_TYPE(ulong,8), 63) +TEMPLATE(_VEC_TYPE(long,16), _VEC_TYPE(ulong,16), 63) + +#undef TEMPLATE diff --git a/src/builtins/add_sat.cl b/src/builtins/add_sat.cl new file mode 100644 index 0000000..e70b3fb --- /dev/null +++ b/src/builtins/add_sat.cl @@ -0,0 +1,37 @@ +/****************************************************************************** + * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "cpu.h" + +BINARY_VEC_DEF(char, char, add_sat, add_sat) +BINARY_VEC_DEF(uchar, uchar, add_sat, add_sat) +BINARY_VEC_DEF(short, short, add_sat, add_sat) +BINARY_VEC_DEF(ushort, ushort, add_sat, add_sat) +BINARY_VEC_DEF(int, int, add_sat, add_sat) +BINARY_VEC_DEF(uint, uint, add_sat, add_sat) +BINARY_VEC_DEF(long, long, add_sat, add_sat) +BINARY_VEC_DEF(ulong, ulong, add_sat, add_sat) diff --git a/src/builtins/all.cl b/src/builtins/all.cl new file mode 100644 index 0000000..96a9ee2 --- /dev/null +++ b/src/builtins/all.cl @@ -0,0 +1,43 @@ +/****************************************************************************** + * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "clc.h" + +#define TEMPLATE(type) \ +_CLC_OVERLOAD _CLC_DEF int all(type##3 x) { return (x.s0 & x.s1 & x.s2) < 0; } \ +_CLC_OVERLOAD _CLC_DEF int all(type##4 x) { return (x.s0 & x.s1 & x.s2 & x.s3) < 0; } \ +_CLC_OVERLOAD _CLC_DEF int all(type##8 x) { return (x.s0 & x.s1 & x.s2 & x.s3 & \ + x.s4 & x.s5 & x.s6 & x.s7) < 0; } \ +_CLC_OVERLOAD _CLC_DEF int all(type##16 x) { return (x.s0 & x.s1 & x.s2 & x.s3 & \ + x.s4 & x.s5 & x.s6 & x.s7 & \ + x.s8 & x.s9 & x.sa & x.sb & \ + x.sc & x.sd & x.se & x.sf) < 0; } \ + +TEMPLATE(char) +TEMPLATE(short) +TEMPLATE(int) +TEMPLATE(long) diff --git a/src/builtins/any.cl b/src/builtins/any.cl new file mode 100644 index 0000000..57c4419 --- /dev/null +++ b/src/builtins/any.cl @@ -0,0 +1,43 @@ +/****************************************************************************** + * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "clc.h" + +/* any(x) for signed 3-, 4-, 8- and 16-component vectors: returns 1 when the + * sign (most significant) bit is set in at least one component of x, + * otherwise 0. The components are OR-ed together, so the combined value is + * negative as soon as one component is negative. Widths above 3 fold the + * vector in halves (x.lo | x.hi) until two lanes remain. */ +#define TEMPLATE(T) \ +_CLC_OVERLOAD _CLC_DEF int any(T##3 x) { return ((x.s0 | x.s1) | x.s2) < 0; } \ +_CLC_OVERLOAD _CLC_DEF int any(T##4 x) { T##2 h = x.lo | x.hi; \ + return (h.s0 | h.s1) < 0; } \ +_CLC_OVERLOAD _CLC_DEF int any(T##8 x) { T##4 q = x.lo | x.hi; \ + T##2 h = q.lo | q.hi; \ + return (h.s0 | h.s1) < 0; } \ +_CLC_OVERLOAD _CLC_DEF int any(T##16 x) { T##8 o = x.lo | x.hi; \ + T##4 q = o.lo | o.hi; \ + T##2 h = q.lo | q.hi; \ + return (h.s0 | h.s1) < 0; } \ + +TEMPLATE(char) +TEMPLATE(short) +TEMPLATE(int) +TEMPLATE(long) diff --git a/src/builtins/atomics.cl.broken b/src/builtins/atomics.cl.broken new file mode 100644 index 0000000..ed46888 --- /dev/null +++ b/src/builtins/atomics.cl.broken @@ -0,0 +1,558 @@ +/****************************************************************************** + * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "cpu.h" + +void __sem_lock(int); +void __sem_unlock(int); +void __inv(char*, int); + +/* Per-address-space hooks used by every atomic below. For global pointers + * LOCK/UNLOCK call __sem_lock(1)/__sem_unlock(1) and INV calls __inv() on the + * operand; WB_GLOBAL and all LOCAL hooks expand to nothing, so the local + * overloads are a plain read-modify-write. */ +#define LOCK_GLOBAL __sem_lock(1) +#define UNLOCK_GLOBAL __sem_unlock(1) +#define INV_GLOBAL(p, sz) __inv((char*)(p), (sz)) +#define WB_GLOBAL(p, sz) + +#define LOCK_LOCAL +#define UNLOCK_LOCAL +#define INV_LOCAL(p, sz) +#define WB_LOCAL(p, sz) + +/* ATOMIC_BODY: the statement sequence shared by all overloads -- INV, LOCK, + * read the old value, run STORE (which may write *p), WB, UNLOCK, return the + * old value. SP is GLOBAL or LOCAL and selects the hooks above; T is the + * operand type. + * NOTE(review): __inv presumably invalidates cached copies of *p, and it runs + * before the lock is taken -- confirm that ordering is intended. */ +#define ATOMIC_BODY(SP, T, STORE) \ +{ \ + INV_##SP(p, sizeof(*p)); \ + LOCK_##SP; \ + T old = *p; \ + STORE; \ + WB_##SP(p, sizeof(*p)); \ + UNLOCK_##SP; \ + return old; \ +} + +/* One overload taking (p), (p, val) or (p, cmp, val); AS is the address-space + * keyword (global/local) matching SP. */ +#define ATOMIC_FN1(name, AS, SP, T, STORE) \ +_CLC_OVERLOAD _CLC_DEF T name(volatile AS T* p) \ +ATOMIC_BODY(SP, T, STORE) + +#define ATOMIC_FN2(name, AS, SP, T, STORE) \ +_CLC_OVERLOAD _CLC_DEF T name(volatile AS T* p, T val) \ +ATOMIC_BODY(SP, T, STORE) + +#define ATOMIC_FN3(name, AS, SP, T, STORE) \ +_CLC_OVERLOAD _CLC_DEF T name(volatile AS T* p, T cmp, T val) \ +ATOMIC_BODY(SP, T, STORE) + +/* The usual overload set: global int, global uint, local int, local uint. */ +#define ATOMIC_INT_UINT(FN, name, STORE) \ +FN(name, global, GLOBAL, int, STORE) \ +FN(name, global, GLOBAL, uint, STORE) \ +FN(name, local, LOCAL, int, STORE) \ +FN(name, local, LOCAL, uint, STORE) + +ATOMIC_INT_UINT(ATOMIC_FN2, atomic_add, *p = old + val) + +ATOMIC_INT_UINT(ATOMIC_FN2, atomic_sub, *p = old - val) + +/* atomic_xchg additionally has float overloads in both address spaces. */ +ATOMIC_FN2(atomic_xchg, global, GLOBAL, int, *p = val) +ATOMIC_FN2(atomic_xchg, global, GLOBAL, uint, *p = val) +ATOMIC_FN2(atomic_xchg, global, GLOBAL, float, *p = val) +ATOMIC_FN2(atomic_xchg, local, LOCAL, int, *p = val) +ATOMIC_FN2(atomic_xchg, local, LOCAL, uint, *p = val) +ATOMIC_FN2(atomic_xchg, local, LOCAL, float, *p = val) + +ATOMIC_INT_UINT(ATOMIC_FN1, atomic_inc, *p = old + 1) + +ATOMIC_INT_UINT(ATOMIC_FN1, atomic_dec, *p = old - 1) + +/* Stores val only when the current value equals cmp. */ +ATOMIC_INT_UINT(ATOMIC_FN3, atomic_cmpxchg, if (old == cmp) *p = val) + +/* min/max store val only when it is smaller/larger than the current value. */ +ATOMIC_INT_UINT(ATOMIC_FN2, atomic_min, if (val < old) *p = val) +ATOMIC_INT_UINT(ATOMIC_FN2, atomic_max, if (val > old) *p = val) + +ATOMIC_INT_UINT(ATOMIC_FN2, atomic_and, *p = old & val) + +ATOMIC_INT_UINT(ATOMIC_FN2, atomic_or, *p = old | val) + +ATOMIC_INT_UINT(ATOMIC_FN2, atomic_xor, *p = old ^ val) + diff --git a/src/builtins/bitselect.cl b/src/builtins/bitselect.cl new file mode 100644 index 0000000..bf93a47 --- /dev/null +++ b/src/builtins/bitselect.cl @@ -0,0 +1,92 @@ +/****************************************************************************** + * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "clc.h" + +/* bitselect(a, b, c): each result bit comes from b where the matching bit of + * c is 1 and from a where it is 0, computed as a ^ ((a ^ b) & c). + * BITSELECT_INT defines the overload for one integer vector type. */ +#define BITSELECT_INT(T) \ +_CLC_OVERLOAD _CLC_DEF T bitselect(T a, T b, T c) { return a ^ ((a ^ b) & c); } + +BITSELECT_INT(char2) +BITSELECT_INT(uchar2) +BITSELECT_INT(long2) +BITSELECT_INT(ulong2) + +BITSELECT_INT(char3) +BITSELECT_INT(uchar3) +BITSELECT_INT(short3) +BITSELECT_INT(ushort3) +BITSELECT_INT(int3) +BITSELECT_INT(uint3) +BITSELECT_INT(long3) +BITSELECT_INT(ulong3) + +BITSELECT_INT(int4) +BITSELECT_INT(uint4) +BITSELECT_INT(long4) +BITSELECT_INT(ulong4) + +BITSELECT_INT(short8) +BITSELECT_INT(ushort8) +BITSELECT_INT(int8) +BITSELECT_INT(uint8) +BITSELECT_INT(long8) +BITSELECT_INT(ulong8) + +BITSELECT_INT(char16) +BITSELECT_INT(uchar16) +BITSELECT_INT(short16) +BITSELECT_INT(ushort16) +BITSELECT_INT(int16) +BITSELECT_INT(uint16) +BITSELECT_INT(long16) +BITSELECT_INT(ulong16) + +/* BITSELECT_FP defines the overload for floating-point type F: the operands + * are reinterpreted as the same-sized integer type I with __builtin_astype, + * combined with the integer formula, and reinterpreted back to F. */ +#define BITSELECT_FP(F, I) \ +_CLC_OVERLOAD _CLC_DEF F bitselect (F a, F b, F c) \ +{ \ + I ia = __builtin_astype(a, I); \ + I ib = __builtin_astype(b, I); \ + I ic = __builtin_astype(c, I); \ + return __builtin_astype(ia ^ (ic & (ib ^ ia)), F); \ +} + +BITSELECT_FP(float, int) +BITSELECT_FP(float2, int2) +BITSELECT_FP(float3, int3) +BITSELECT_FP(float4, int4) +BITSELECT_FP(float8, int8) +BITSELECT_FP(float16, int16) + +BITSELECT_FP(double, long) +BITSELECT_FP(double2, long2) +BITSELECT_FP(double3, long3) +BITSELECT_FP(double4, long4) +BITSELECT_FP(double8, long8) +BITSELECT_FP(double16, long16) diff --git a/src/builtins/clamp.cl b/src/builtins/clamp.cl new file mode 100644 
index 0000000..78a29fb --- /dev/null +++ b/src/builtins/clamp.cl @@ -0,0 +1,43 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "clc.h" + +/* EXPAND_SIZES(type): emits IMPLEMENTATION for the 2-, 3-, 4-, 8- and + * 16-component vectors of one element type. It is not invoked in this file; + * presumably _EXPAND_TYPES() (defined outside this file) applies it to each + * element type -- confirm against its definition. */ +#define EXPAND_SIZES(type) \ + IMPLEMENTATION(_VEC_TYPE(type,2), type) \ + IMPLEMENTATION(_VEC_TYPE(type,3), type) \ + IMPLEMENTATION(_VEC_TYPE(type,4), type) \ + IMPLEMENTATION(_VEC_TYPE(type,8), type) \ + IMPLEMENTATION(_VEC_TYPE(type,16), type) + +/* IMPLEMENTATION(gentype, sgentype): defines two clamp() overloads for the + * vector type gentype. Per component the result is maxval where x > maxval, + * otherwise minval where x < minval, otherwise x. The second overload takes + * scalar bounds of type sgentype and converts them to gentype first. */ +#define IMPLEMENTATION(gentype, sgentype) \ +_CLC_OVERLOAD _CLC_DEF gentype clamp(gentype x, gentype minval, gentype maxval) \ +{ \ + gentype floored = (x < minval) ? minval : x; \ + return (x > maxval) ? maxval : floored; \ +} \ +_CLC_OVERLOAD _CLC_DEF gentype clamp(gentype x, sgentype minval, sgentype maxval) \ +{ \ + gentype vmin = (gentype)minval; \ + gentype vmax = (gentype)maxval; \ + gentype floored = (x < vmin) ? vmin : x; \ + return (x > vmax) ? vmax : floored; \ +} + +_EXPAND_TYPES() diff --git a/src/builtins/clz.cl b/src/builtins/clz.cl new file mode 100644 index 0000000..ac06119 --- /dev/null +++ b/src/builtins/clz.cl @@ -0,0 +1,37 @@ +/****************************************************************************** + * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "cpu.h" + +/* Instantiates clz for each of the eight integer element types through + * UNARY_VEC_DEF, which is defined outside this file. + * NOTE(review): presumably the arguments are (return element type, argument + * element type, vector function name, scalar function name) and the macro + * emits the vector-width overloads -- confirm against its definition. */ +UNARY_VEC_DEF(char, char, clz, clz) +UNARY_VEC_DEF(uchar, uchar, clz, clz) +UNARY_VEC_DEF(short, short, clz, clz) +UNARY_VEC_DEF(ushort, ushort,clz, clz) +UNARY_VEC_DEF(int, int, clz, clz) +UNARY_VEC_DEF(uint, uint, clz, clz) +UNARY_VEC_DEF(long, long, clz, clz) +UNARY_VEC_DEF(ulong, ulong, clz, clz) diff --git a/src/builtins/convert.cl b/src/builtins/convert.cl new file mode 100644 index 0000000..2f47c2d --- /dev/null +++ b/src/builtins/convert.cl @@ -0,0 +1,36122 @@ +/* !!!! AUTOGENERATED FILE generated by convert_type.py !!!!! + + DON'T CHANGE THIS FILE. 
MAKE YOUR CHANGES TO convert_type.py AND RUN: + $ ./generate-conversion-type-cl.sh + + OpenCL type conversion functions + + Copyright (c) 2013 Victor Oliveira <victormatheus@gmail.com> + Copyright (c) 2013 Jesse Towner <jessetowner@lavabit.com> + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+*/ + +#include "clc.h" + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif + +#define cles_khr_int64 + +_CLC_DEF _CLC_OVERLOAD +char convert_char(char x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2(char2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4(char4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8(char8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16(char16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3(char3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar(char x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2(char2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4(char4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8(char8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16(char16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3(char3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +short convert_short(char x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2(char2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4(char4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8(char8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF 
_CLC_OVERLOAD +short16 convert_short16(char16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3(char3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort(char x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2(char2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4(char4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8(char8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16(char16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3(char3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int(char x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2(char2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4(char4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8(char8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16(char16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3(char3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint(char x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2(char2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4(char4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 
convert_uint8(char8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16(char16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3(char3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long(char x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2(char2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4(char4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8(char8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16(char16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3(char3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong(char x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2(char2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4(char4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8(char8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16(char16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3(char3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +_CLC_DEF _CLC_OVERLOAD +float convert_float(char x) +{ + return (float)x; +} + +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2(char2 x) +{ + return (float2)(convert_float(x.lo), 
convert_float(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4(char4 x) +{ + return (float4)(convert_float2(x.lo), convert_float2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8(char8 x) +{ + return (float8)(convert_float4(x.lo), convert_float4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16(char16 x) +{ + return (float16)(convert_float8(x.lo), convert_float8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3(char3 x) +{ + return (float3)(convert_float2(x.s01), convert_float(x.s2)); +} +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double(char x) +{ + return (double)x; +} + +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2(char2 x) +{ + return (double2)(convert_double(x.lo), convert_double(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4(char4 x) +{ + return (double4)(convert_double2(x.lo), convert_double2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8(char8 x) +{ + return (double8)(convert_double4(x.lo), convert_double4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16(char16 x) +{ + return (double16)(convert_double8(x.lo), convert_double8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3(char3 x) +{ + return (double3)(convert_double2(x.s01), convert_double(x.s2)); +} +#endif +_CLC_DEF _CLC_OVERLOAD +char convert_char(uchar x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2(uchar2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4(uchar4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8(uchar8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16(uchar16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3(uchar3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); 
+} +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar(uchar x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2(uchar2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4(uchar4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8(uchar8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16(uchar16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3(uchar3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +short convert_short(uchar x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2(uchar2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4(uchar4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8(uchar8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16(uchar16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3(uchar3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort(uchar x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2(uchar2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4(uchar4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8(uchar8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16(uchar16 x) +{ + return 
(ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3(uchar3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int(uchar x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2(uchar2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4(uchar4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8(uchar8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16(uchar16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3(uchar3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint(uchar x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2(uchar2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4(uchar4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8(uchar8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16(uchar16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3(uchar3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long(uchar x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2(uchar2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4(uchar4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8(uchar8 x) +{ + return (long8)(convert_long4(x.lo), 
convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16(uchar16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3(uchar3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong(uchar x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2(uchar2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4(uchar4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8(uchar8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16(uchar16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3(uchar3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +_CLC_DEF _CLC_OVERLOAD +float convert_float(uchar x) +{ + return (float)x; +} + +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2(uchar2 x) +{ + return (float2)(convert_float(x.lo), convert_float(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4(uchar4 x) +{ + return (float4)(convert_float2(x.lo), convert_float2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8(uchar8 x) +{ + return (float8)(convert_float4(x.lo), convert_float4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16(uchar16 x) +{ + return (float16)(convert_float8(x.lo), convert_float8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3(uchar3 x) +{ + return (float3)(convert_float2(x.s01), convert_float(x.s2)); +} +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double(uchar x) +{ + return (double)x; +} + +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2(uchar2 x) +{ + return (double2)(convert_double(x.lo), convert_double(x.hi)); +} + 
+_CLC_DEF _CLC_OVERLOAD +double4 convert_double4(uchar4 x) +{ + return (double4)(convert_double2(x.lo), convert_double2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8(uchar8 x) +{ + return (double8)(convert_double4(x.lo), convert_double4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16(uchar16 x) +{ + return (double16)(convert_double8(x.lo), convert_double8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3(uchar3 x) +{ + return (double3)(convert_double2(x.s01), convert_double(x.s2)); +} +#endif +_CLC_DEF _CLC_OVERLOAD +char convert_char(short x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2(short2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4(short4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8(short8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16(short16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3(short3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar(short x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2(short2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4(short4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8(short8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16(short16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3(short3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +short 
convert_short(short x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2(short2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4(short4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8(short8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16(short16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3(short3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort(short x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2(short2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4(short4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8(short8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16(short16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3(short3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int(short x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2(short2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4(short4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8(short8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16(short16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF 
_CLC_OVERLOAD +int3 convert_int3(short3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint(short x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2(short2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4(short4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8(short8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16(short16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3(short3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long(short x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2(short2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4(short4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8(short8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16(short16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3(short3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong(short x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2(short2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4(short4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8(short8 x) +{ + return (ulong8)(convert_ulong4(x.lo), 
convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16(short16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3(short3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +_CLC_DEF _CLC_OVERLOAD +float convert_float(short x) +{ + return (float)x; +} + +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2(short2 x) +{ + return (float2)(convert_float(x.lo), convert_float(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4(short4 x) +{ + return (float4)(convert_float2(x.lo), convert_float2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8(short8 x) +{ + return (float8)(convert_float4(x.lo), convert_float4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16(short16 x) +{ + return (float16)(convert_float8(x.lo), convert_float8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3(short3 x) +{ + return (float3)(convert_float2(x.s01), convert_float(x.s2)); +} +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double(short x) +{ + return (double)x; +} + +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2(short2 x) +{ + return (double2)(convert_double(x.lo), convert_double(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4(short4 x) +{ + return (double4)(convert_double2(x.lo), convert_double2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8(short8 x) +{ + return (double8)(convert_double4(x.lo), convert_double4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16(short16 x) +{ + return (double16)(convert_double8(x.lo), convert_double8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3(short3 x) +{ + return (double3)(convert_double2(x.s01), convert_double(x.s2)); +} +#endif +_CLC_DEF _CLC_OVERLOAD +char convert_char(ushort x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2(ushort2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + 
+_CLC_DEF _CLC_OVERLOAD +char4 convert_char4(ushort4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8(ushort8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16(ushort16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3(ushort3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar(ushort x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2(ushort2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4(ushort4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8(ushort8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16(ushort16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3(ushort3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +short convert_short(ushort x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2(ushort2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4(ushort4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8(ushort8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16(ushort16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3(ushort3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +ushort 
convert_ushort(ushort x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2(ushort2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4(ushort4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8(ushort8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16(ushort16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3(ushort3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int(ushort x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2(ushort2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4(ushort4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8(ushort8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16(ushort16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3(ushort3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint(ushort x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2(ushort2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4(ushort4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8(ushort8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16(ushort16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD 
+uint3 convert_uint3(ushort3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long(ushort x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2(ushort2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4(ushort4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8(ushort8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16(ushort16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3(ushort3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong(ushort x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2(ushort2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4(ushort4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8(ushort8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16(ushort16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3(ushort3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +_CLC_DEF _CLC_OVERLOAD +float convert_float(ushort x) +{ + return (float)x; +} + +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2(ushort2 x) +{ + return (float2)(convert_float(x.lo), convert_float(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4(ushort4 x) +{ + return (float4)(convert_float2(x.lo), convert_float2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8(ushort8 x) +{ + return 
(float8)(convert_float4(x.lo), convert_float4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16(ushort16 x) +{ + return (float16)(convert_float8(x.lo), convert_float8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3(ushort3 x) +{ + return (float3)(convert_float2(x.s01), convert_float(x.s2)); +} +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double(ushort x) +{ + return (double)x; +} + +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2(ushort2 x) +{ + return (double2)(convert_double(x.lo), convert_double(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4(ushort4 x) +{ + return (double4)(convert_double2(x.lo), convert_double2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8(ushort8 x) +{ + return (double8)(convert_double4(x.lo), convert_double4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16(ushort16 x) +{ + return (double16)(convert_double8(x.lo), convert_double8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3(ushort3 x) +{ + return (double3)(convert_double2(x.s01), convert_double(x.s2)); +} +#endif +_CLC_DEF _CLC_OVERLOAD +char convert_char(int x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2(int2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4(int4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8(int8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16(int16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3(int3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar(int x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2(int2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF 
_CLC_OVERLOAD +uchar4 convert_uchar4(int4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8(int8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16(int16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3(int3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +short convert_short(int x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2(int2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4(int4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8(int8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16(int16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3(int3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort(int x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2(int2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4(int4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8(int8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16(int16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3(int3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int(int x) +{ + 
return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2(int2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4(int4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8(int8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16(int16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3(int3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint(int x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2(int2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4(int4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8(int8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16(int16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3(int3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long(int x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2(int2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4(int4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8(int8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16(int16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3(int3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif 
+#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong(int x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2(int2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4(int4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8(int8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16(int16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3(int3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +_CLC_DEF _CLC_OVERLOAD +float convert_float(int x) +{ + return (float)x; +} + +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2(int2 x) +{ + return (float2)(convert_float(x.lo), convert_float(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4(int4 x) +{ + return (float4)(convert_float2(x.lo), convert_float2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8(int8 x) +{ + return (float8)(convert_float4(x.lo), convert_float4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16(int16 x) +{ + return (float16)(convert_float8(x.lo), convert_float8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3(int3 x) +{ + return (float3)(convert_float2(x.s01), convert_float(x.s2)); +} +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double(int x) +{ + return (double)x; +} + +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2(int2 x) +{ + return (double2)(convert_double(x.lo), convert_double(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4(int4 x) +{ + return (double4)(convert_double2(x.lo), convert_double2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8(int8 x) +{ + return (double8)(convert_double4(x.lo), convert_double4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16(int16 
x) +{ + return (double16)(convert_double8(x.lo), convert_double8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3(int3 x) +{ + return (double3)(convert_double2(x.s01), convert_double(x.s2)); +} +#endif +_CLC_DEF _CLC_OVERLOAD +char convert_char(uint x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2(uint2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4(uint4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8(uint8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16(uint16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3(uint3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar(uint x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2(uint2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4(uint4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8(uint8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16(uint16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3(uint3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +short convert_short(uint x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2(uint2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4(uint4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 
convert_short8(uint8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16(uint16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3(uint3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort(uint x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2(uint2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4(uint4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8(uint8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16(uint16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3(uint3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int(uint x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2(uint2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4(uint4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8(uint8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16(uint16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3(uint3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint(uint x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2(uint2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 
convert_uint4(uint4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8(uint8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16(uint16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3(uint3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long(uint x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2(uint2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4(uint4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8(uint8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16(uint16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3(uint3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong(uint x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2(uint2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4(uint4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8(uint8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16(uint16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3(uint3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +_CLC_DEF _CLC_OVERLOAD +float convert_float(uint x) +{ + return (float)x; 
+} + +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2(uint2 x) +{ + return (float2)(convert_float(x.lo), convert_float(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4(uint4 x) +{ + return (float4)(convert_float2(x.lo), convert_float2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8(uint8 x) +{ + return (float8)(convert_float4(x.lo), convert_float4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16(uint16 x) +{ + return (float16)(convert_float8(x.lo), convert_float8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3(uint3 x) +{ + return (float3)(convert_float2(x.s01), convert_float(x.s2)); +} +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double(uint x) +{ + return (double)x; +} + +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2(uint2 x) +{ + return (double2)(convert_double(x.lo), convert_double(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4(uint4 x) +{ + return (double4)(convert_double2(x.lo), convert_double2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8(uint8 x) +{ + return (double8)(convert_double4(x.lo), convert_double4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16(uint16 x) +{ + return (double16)(convert_double8(x.lo), convert_double8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3(uint3 x) +{ + return (double3)(convert_double2(x.s01), convert_double(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char convert_char(long x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2(long2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4(long4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8(long8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16(long16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} 
+ +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3(long3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar(long x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2(long2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4(long4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8(long8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16(long16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3(long3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short convert_short(long x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2(long2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4(long4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8(long8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16(long16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3(long3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort(long x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2(long2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4(long4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF 
_CLC_OVERLOAD +ushort8 convert_ushort8(long8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16(long16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3(long3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int convert_int(long x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2(long2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4(long4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8(long8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16(long16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3(long3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint convert_uint(long x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2(long2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4(long4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8(long8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16(long16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3(long3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long(long x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2(long2 x) +{ + return (long2)(convert_long(x.lo), 
convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4(long4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8(long8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16(long16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3(long3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong(long x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2(long2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4(long4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8(long8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16(long16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3(long3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float convert_float(long x) +{ + return (float)x; +} + +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2(long2 x) +{ + return (float2)(convert_float(x.lo), convert_float(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4(long4 x) +{ + return (float4)(convert_float2(x.lo), convert_float2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8(long8 x) +{ + return (float8)(convert_float4(x.lo), convert_float4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16(long16 x) +{ + return (float16)(convert_float8(x.lo), convert_float8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3(long3 x) +{ + return (float3)(convert_float2(x.s01), 
convert_float(x.s2)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double convert_double(long x) +{ + return (double)x; +} + +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2(long2 x) +{ + return (double2)(convert_double(x.lo), convert_double(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4(long4 x) +{ + return (double4)(convert_double2(x.lo), convert_double2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8(long8 x) +{ + return (double8)(convert_double4(x.lo), convert_double4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16(long16 x) +{ + return (double16)(convert_double8(x.lo), convert_double8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3(long3 x) +{ + return (double3)(convert_double2(x.s01), convert_double(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char convert_char(ulong x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2(ulong2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4(ulong4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8(ulong8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16(ulong16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3(ulong3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar(ulong x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2(ulong2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4(ulong4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8(ulong8 x) +{ + return 
(uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16(ulong16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3(ulong3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short convert_short(ulong x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2(ulong2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4(ulong4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8(ulong8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16(ulong16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3(ulong3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort(ulong x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2(ulong2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4(ulong4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8(ulong8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16(ulong16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3(ulong3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int convert_int(ulong x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 
convert_int2(ulong2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4(ulong4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8(ulong8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16(ulong16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3(ulong3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint convert_uint(ulong x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2(ulong2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4(ulong4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8(ulong8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16(ulong16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3(ulong3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long(ulong x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2(ulong2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4(ulong4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8(ulong8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16(ulong16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3(ulong3 x) +{ + return (long3)(convert_long2(x.s01), 
convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong(ulong x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2(ulong2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4(ulong4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8(ulong8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16(ulong16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3(ulong3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float convert_float(ulong x) +{ + return (float)x; +} + +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2(ulong2 x) +{ + return (float2)(convert_float(x.lo), convert_float(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4(ulong4 x) +{ + return (float4)(convert_float2(x.lo), convert_float2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8(ulong8 x) +{ + return (float8)(convert_float4(x.lo), convert_float4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16(ulong16 x) +{ + return (float16)(convert_float8(x.lo), convert_float8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3(ulong3 x) +{ + return (float3)(convert_float2(x.s01), convert_float(x.s2)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double convert_double(ulong x) +{ + return (double)x; +} + +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2(ulong2 x) +{ + return (double2)(convert_double(x.lo), convert_double(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4(ulong4 x) +{ + return (double4)(convert_double2(x.lo), convert_double2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8(ulong8 x) +{ 
+ return (double8)(convert_double4(x.lo), convert_double4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16(ulong16 x) +{ + return (double16)(convert_double8(x.lo), convert_double8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3(ulong3 x) +{ + return (double3)(convert_double2(x.s01), convert_double(x.s2)); +} +#endif +_CLC_DEF _CLC_OVERLOAD +char convert_char(float x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2(float2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4(float4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8(float8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16(float16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3(float3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar(float x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2(float2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4(float4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8(float8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16(float16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3(float3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +short convert_short(float x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2(float2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 
convert_short4(float4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8(float8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16(float16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3(float3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort(float x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2(float2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4(float4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8(float8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16(float16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3(float3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int(float x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2(float2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4(float4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8(float8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16(float16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3(float3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint(float x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD 
+uint2 convert_uint2(float2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4(float4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8(float8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16(float16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3(float3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long(float x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2(float2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4(float4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8(float8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16(float16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3(float3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong(float x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2(float2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4(float4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8(float8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16(float16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3(float3 x) +{ + 
return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +_CLC_DEF _CLC_OVERLOAD +float convert_float(float x) +{ + return (float)x; +} + +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2(float2 x) +{ + return (float2)(convert_float(x.lo), convert_float(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4(float4 x) +{ + return (float4)(convert_float2(x.lo), convert_float2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8(float8 x) +{ + return (float8)(convert_float4(x.lo), convert_float4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16(float16 x) +{ + return (float16)(convert_float8(x.lo), convert_float8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3(float3 x) +{ + return (float3)(convert_float2(x.s01), convert_float(x.s2)); +} +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double(float x) +{ + return (double)x; +} + +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2(float2 x) +{ + return (double2)(convert_double(x.lo), convert_double(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4(float4 x) +{ + return (double4)(convert_double2(x.lo), convert_double2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8(float8 x) +{ + return (double8)(convert_double4(x.lo), convert_double4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16(float16 x) +{ + return (double16)(convert_double8(x.lo), convert_double8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3(float3 x) +{ + return (double3)(convert_double2(x.s01), convert_double(x.s2)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char convert_char(double x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2(double2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4(double4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8(double8 x) +{ + return 
(char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16(double16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3(double3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar(double x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2(double2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4(double4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8(double8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16(double16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3(double3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short convert_short(double x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2(double2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4(double4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8(double8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16(double16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3(double3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort(double x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2(double2 x) +{ + 
return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4(double4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8(double8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16(double16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3(double3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +int convert_int(double x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2(double2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4(double4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8(double8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16(double16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3(double3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uint convert_uint(double x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2(double2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4(double4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8(double8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16(double16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3(double3 x) +{ + return 
(uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +long convert_long(double x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2(double2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4(double4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8(double8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16(double16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3(double3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong(double x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2(double2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4(double4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8(double8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16(double16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3(double3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +float convert_float(double x) +{ + return (float)x; +} + +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2(double2 x) +{ + return (float2)(convert_float(x.lo), convert_float(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4(double4 x) +{ + return (float4)(convert_float2(x.lo), convert_float2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float8 
convert_float8(double8 x) +{ + return (float8)(convert_float4(x.lo), convert_float4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16(double16 x) +{ + return (float16)(convert_float8(x.lo), convert_float8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3(double3 x) +{ + return (float3)(convert_float2(x.s01), convert_float(x.s2)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double(double x) +{ + return (double)x; +} + +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2(double2 x) +{ + return (double2)(convert_double(x.lo), convert_double(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4(double4 x) +{ + return (double4)(convert_double2(x.lo), convert_double2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8(double8 x) +{ + return (double8)(convert_double4(x.lo), convert_double4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16(double16 x) +{ + return (double16)(convert_double8(x.lo), convert_double8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3(double3 x) +{ + return (double3)(convert_double2(x.s01), convert_double(x.s2)); +} +#endif + + +#if 0 // ASW + +_CLC_DEF _CLC_OVERLOAD +char convert_char_rtz(char x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rtz(char2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rtz(char4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rtz(char8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rtz(char16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rtz(char3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +char convert_char_rte(char x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 
convert_char2_rte(char2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rte(char4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rte(char8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rte(char16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rte(char3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +char convert_char_rtp(char x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rtp(char2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rtp(char4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rtp(char8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rtp(char16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rtp(char3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +char convert_char_rtn(char x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rtn(char2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rtn(char4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rtn(char8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rtn(char16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rtn(char3 x) +{ + return (char3)(convert_char2(x.s01), 
convert_char(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rtz(char x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rtz(char2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rtz(char4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rtz(char8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rtz(char16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rtz(char3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rte(char x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rte(char2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rte(char4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rte(char8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rte(char16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rte(char3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rtp(char x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rtp(char2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rtp(char4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rtp(char8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF 
_CLC_OVERLOAD +uchar16 convert_uchar16_rtp(char16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rtp(char3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rtn(char x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rtn(char2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rtn(char4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rtn(char8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rtn(char16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rtn(char3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +short convert_short_rtz(char x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rtz(char2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rtz(char4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rtz(char8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_rtz(char16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rtz(char3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +short convert_short_rte(char x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rte(char2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 
convert_short4_rte(char4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rte(char8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_rte(char16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rte(char3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +short convert_short_rtp(char x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rtp(char2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rtp(char4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rtp(char8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_rtp(char16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rtp(char3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +short convert_short_rtn(char x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rtn(char2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rtn(char4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rtn(char8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_rtn(char16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rtn(char3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD 
+ushort convert_ushort_rtz(char x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_rtz(char2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_rtz(char4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_rtz(char8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_rtz(char16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_rtz(char3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_rte(char x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_rte(char2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_rte(char4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_rte(char8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_rte(char16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_rte(char3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_rtp(char x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_rtp(char2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_rtp(char4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_rtp(char8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); 
+} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_rtp(char16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_rtp(char3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_rtn(char x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_rtn(char2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_rtn(char4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_rtn(char8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_rtn(char16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_rtn(char3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int_rtz(char x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rtz(char2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rtz(char4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rtz(char8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rtz(char16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rtz(char3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int_rte(char x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rte(char2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rte(char4 x) +{ + return 
(int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rte(char8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rte(char16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rte(char3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int_rtp(char x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rtp(char2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rtp(char4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rtp(char8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rtp(char16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rtp(char3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int_rtn(char x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rtn(char2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rtn(char4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rtn(char8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rtn(char16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rtn(char3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rtz(char x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rtz(char2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF 
_CLC_OVERLOAD +uint4 convert_uint4_rtz(char4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rtz(char8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rtz(char16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rtz(char3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rte(char x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rte(char2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rte(char4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rte(char8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rte(char16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rte(char3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rtp(char x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rtp(char2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rtp(char4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rtp(char8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rtp(char16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rtp(char3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rtn(char x) +{ + return (uint)x; +} + 
+_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rtn(char2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rtn(char4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rtn(char8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rtn(char16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rtn(char3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rtz(char x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rtz(char2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rtz(char4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rtz(char8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rtz(char16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rtz(char3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rte(char x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rte(char2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rte(char4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rte(char8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rte(char16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF 
_CLC_OVERLOAD +long3 convert_long3_rte(char3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rtp(char x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rtp(char2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rtp(char4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rtp(char8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rtp(char16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rtp(char3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rtn(char x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rtn(char2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rtn(char4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rtn(char8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rtn(char16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rtn(char3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rtz(char x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rtz(char2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rtz(char4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD 
+ulong8 convert_ulong8_rtz(char8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rtz(char16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rtz(char3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rte(char x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rte(char2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rte(char4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rte(char8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rte(char16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rte(char3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rtp(char x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rtp(char2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rtp(char4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rtp(char8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rtp(char16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rtp(char3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rtn(char x) 
+{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rtn(char2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rtn(char4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rtn(char8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rtn(char16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rtn(char3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +_CLC_DEF _CLC_OVERLOAD +char convert_char_rtz(uchar x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rtz(uchar2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rtz(uchar4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rtz(uchar8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rtz(uchar16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rtz(uchar3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +char convert_char_rte(uchar x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rte(uchar2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rte(uchar4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rte(uchar8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rte(uchar16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + 
+_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rte(uchar3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +char convert_char_rtp(uchar x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rtp(uchar2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rtp(uchar4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rtp(uchar8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rtp(uchar16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rtp(uchar3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +char convert_char_rtn(uchar x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rtn(uchar2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rtn(uchar4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rtn(uchar8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rtn(uchar16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rtn(uchar3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rtz(uchar x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rtz(uchar2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rtz(uchar4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rtz(uchar8 x) +{ + return 
(uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rtz(uchar16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rtz(uchar3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rte(uchar x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rte(uchar2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rte(uchar4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rte(uchar8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rte(uchar16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rte(uchar3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rtp(uchar x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rtp(uchar2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rtp(uchar4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rtp(uchar8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rtp(uchar16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rtp(uchar3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rtn(uchar x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rtn(uchar2 x) +{ + return 
(uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rtn(uchar4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rtn(uchar8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rtn(uchar16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rtn(uchar3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +short convert_short_rtz(uchar x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rtz(uchar2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rtz(uchar4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rtz(uchar8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_rtz(uchar16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rtz(uchar3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +short convert_short_rte(uchar x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rte(uchar2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rte(uchar4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rte(uchar8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_rte(uchar16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 
convert_short3_rte(uchar3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +short convert_short_rtp(uchar x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rtp(uchar2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rtp(uchar4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rtp(uchar8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_rtp(uchar16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rtp(uchar3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +short convert_short_rtn(uchar x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rtn(uchar2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rtn(uchar4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rtn(uchar8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_rtn(uchar16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rtn(uchar3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_rtz(uchar x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_rtz(uchar2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_rtz(uchar4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 
convert_ushort8_rtz(uchar8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_rtz(uchar16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_rtz(uchar3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_rte(uchar x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_rte(uchar2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_rte(uchar4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_rte(uchar8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_rte(uchar16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_rte(uchar3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_rtp(uchar x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_rtp(uchar2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_rtp(uchar4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_rtp(uchar8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_rtp(uchar16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_rtp(uchar3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_rtn(uchar x) +{ + return 
(ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_rtn(uchar2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_rtn(uchar4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_rtn(uchar8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_rtn(uchar16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_rtn(uchar3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int_rtz(uchar x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rtz(uchar2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rtz(uchar4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rtz(uchar8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rtz(uchar16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rtz(uchar3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int_rte(uchar x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rte(uchar2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rte(uchar4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rte(uchar8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rte(uchar16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 
convert_int3_rte(uchar3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int_rtp(uchar x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rtp(uchar2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rtp(uchar4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rtp(uchar8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rtp(uchar16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rtp(uchar3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int_rtn(uchar x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rtn(uchar2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rtn(uchar4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rtn(uchar8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rtn(uchar16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rtn(uchar3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rtz(uchar x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rtz(uchar2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rtz(uchar4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rtz(uchar8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rtz(uchar16 x) +{ + 
return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rtz(uchar3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rte(uchar x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rte(uchar2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rte(uchar4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rte(uchar8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rte(uchar16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rte(uchar3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rtp(uchar x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rtp(uchar2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rtp(uchar4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rtp(uchar8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rtp(uchar16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rtp(uchar3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rtn(uchar x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rtn(uchar2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rtn(uchar4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 
convert_uint8_rtn(uchar8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rtn(uchar16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rtn(uchar3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rtz(uchar x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rtz(uchar2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rtz(uchar4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rtz(uchar8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rtz(uchar16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rtz(uchar3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rte(uchar x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rte(uchar2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rte(uchar4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rte(uchar8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rte(uchar16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rte(uchar3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rtp(uchar x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 
convert_long2_rtp(uchar2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rtp(uchar4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rtp(uchar8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rtp(uchar16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rtp(uchar3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rtn(uchar x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rtn(uchar2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rtn(uchar4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rtn(uchar8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rtn(uchar16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rtn(uchar3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rtz(uchar x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rtz(uchar2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rtz(uchar4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rtz(uchar8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rtz(uchar16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + 
+_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rtz(uchar3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rte(uchar x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rte(uchar2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rte(uchar4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rte(uchar8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rte(uchar16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rte(uchar3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rtp(uchar x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rtp(uchar2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rtp(uchar4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rtp(uchar8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rtp(uchar16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rtp(uchar3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rtn(uchar x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rtn(uchar2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rtn(uchar4 x) +{ + 
return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rtn(uchar8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rtn(uchar16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rtn(uchar3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +_CLC_DEF _CLC_OVERLOAD +char convert_char_rtz(short x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rtz(short2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rtz(short4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rtz(short8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rtz(short16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rtz(short3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +char convert_char_rte(short x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rte(short2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rte(short4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rte(short8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rte(short16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rte(short3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +char convert_char_rtp(short x) +{ + return (char)x; +} + +_CLC_DEF 
_CLC_OVERLOAD +char2 convert_char2_rtp(short2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rtp(short4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rtp(short8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rtp(short16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rtp(short3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +char convert_char_rtn(short x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rtn(short2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rtn(short4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rtn(short8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rtn(short16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rtn(short3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rtz(short x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rtz(short2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rtz(short4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rtz(short8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rtz(short16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 
convert_uchar3_rtz(short3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rte(short x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rte(short2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rte(short4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rte(short8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rte(short16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rte(short3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rtp(short x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rtp(short2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rtp(short4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rtp(short8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rtp(short16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rtp(short3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rtn(short x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rtn(short2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rtn(short4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rtn(short8 x) +{ 
+ return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rtn(short16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rtn(short3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +short convert_short_rtz(short x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rtz(short2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rtz(short4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rtz(short8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_rtz(short16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rtz(short3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +short convert_short_rte(short x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rte(short2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rte(short4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rte(short8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_rte(short16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rte(short3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +short convert_short_rtp(short x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rtp(short2 x) +{ + return 
(short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rtp(short4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rtp(short8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_rtp(short16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rtp(short3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +short convert_short_rtn(short x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rtn(short2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rtn(short4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rtn(short8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_rtn(short16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rtn(short3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_rtz(short x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_rtz(short2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_rtz(short4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_rtz(short8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_rtz(short16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD 
+ushort3 convert_ushort3_rtz(short3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_rte(short x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_rte(short2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_rte(short4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_rte(short8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_rte(short16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_rte(short3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_rtp(short x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_rtp(short2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_rtp(short4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_rtp(short8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_rtp(short16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_rtp(short3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_rtn(short x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_rtn(short2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_rtn(short4 x) +{ + return (ushort4)(convert_ushort2(x.lo), 
convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_rtn(short8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_rtn(short16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_rtn(short3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int_rtz(short x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rtz(short2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rtz(short4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rtz(short8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rtz(short16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rtz(short3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int_rte(short x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rte(short2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rte(short4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rte(short8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rte(short16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rte(short3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int_rtp(short x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rtp(short2 x) +{ + return (int2)(convert_int(x.lo), 
convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rtp(short4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rtp(short8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rtp(short16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rtp(short3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int_rtn(short x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rtn(short2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rtn(short4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rtn(short8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rtn(short16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rtn(short3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rtz(short x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rtz(short2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rtz(short4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rtz(short8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rtz(short16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rtz(short3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rte(short x) +{ + return (uint)x; 
+} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rte(short2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rte(short4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rte(short8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rte(short16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rte(short3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rtp(short x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rtp(short2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rtp(short4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rtp(short8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rtp(short16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rtp(short3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rtn(short x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rtn(short2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rtn(short4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rtn(short8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rtn(short16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 
convert_uint3_rtn(short3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rtz(short x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rtz(short2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rtz(short4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rtz(short8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rtz(short16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rtz(short3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rte(short x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rte(short2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rte(short4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rte(short8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rte(short16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rte(short3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rtp(short x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rtp(short2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rtp(short4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 
convert_long8_rtp(short8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rtp(short16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rtp(short3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rtn(short x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rtn(short2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rtn(short4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rtn(short8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rtn(short16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rtn(short3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rtz(short x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rtz(short2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rtz(short4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rtz(short8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rtz(short16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rtz(short3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rte(short x) +{ + return (ulong)x; +} + 
#endif /* closes the cles_khr_int64 guard that is still open where this block begins */

/*
 * Integer-to-integer convert_<T>[N]<mode> overloads.
 *
 * Every overload in this block has one of three shapes, so the overloads are
 * generated by macros instead of being written out one by one:
 *   - scalar:           a plain C cast of the argument to the destination type;
 *   - 2/4/8/16-wide:    the .lo and .hi halves of the argument are converted
 *                       with the un-suffixed convert_<T>[N/2] and recombined;
 *   - 3-wide:           .s01 is converted with convert_<T>2 and .s2 with
 *                       convert_<T>, then recombined.
 * The rounding-mode suffix only selects the function name; the generated body
 * is the same for every mode.
 *
 * Macro parameters:
 *   DST   destination scalar type name (e.g. char, ulong)
 *   SRC   source scalar type name (e.g. ushort, int)
 *   MODE  rounding-mode suffix including the leading underscore (e.g. _rtz)
 *
 * NOTE(review): names are built by token pasting, so this assumes the scalar
 * type names passed in are plain identifiers (keywords or typedefs) and not
 * object-like macros -- confirm against the project's type header.
 */

/* Scalar overload: direct cast. */
#define _CLC_INTCONV_SCALAR(DST, SRC, MODE) \
    _CLC_DEF _CLC_OVERLOAD \
    DST convert_##DST##MODE(SRC x) \
    { \
        return (DST)x; \
    }

/* 2-wide overload: each half is a scalar, converted with convert_<DST>. */
#define _CLC_INTCONV_VEC2(DST, SRC, MODE) \
    _CLC_DEF _CLC_OVERLOAD \
    DST##2 convert_##DST##2##MODE(SRC##2 x) \
    { \
        return (DST##2)(convert_##DST(x.lo), convert_##DST(x.hi)); \
    }

/* N-wide overload (N = 4, 8, 16): each half is H-wide, with H = N / 2. */
#define _CLC_INTCONV_HALVES(DST, SRC, MODE, N, H) \
    _CLC_DEF _CLC_OVERLOAD \
    DST##N convert_##DST##N##MODE(SRC##N x) \
    { \
        return (DST##N)(convert_##DST##H(x.lo), convert_##DST##H(x.hi)); \
    }

/* 3-wide overload: first two lanes as a 2-vector, third lane as a scalar. */
#define _CLC_INTCONV_VEC3(DST, SRC, MODE) \
    _CLC_DEF _CLC_OVERLOAD \
    DST##3 convert_##DST##3##MODE(SRC##3 x) \
    { \
        return (DST##3)(convert_##DST##2(x.s01), convert_##DST(x.s2)); \
    }

/* All vector overloads for one (DST, SRC, MODE), in the order 2, 4, 8, 16, 3. */
#define _CLC_INTCONV_VECTORS(DST, SRC, MODE) \
    _CLC_INTCONV_VEC2(DST, SRC, MODE) \
    _CLC_INTCONV_HALVES(DST, SRC, MODE, 4, 2) \
    _CLC_INTCONV_HALVES(DST, SRC, MODE, 8, 4) \
    _CLC_INTCONV_HALVES(DST, SRC, MODE, 16, 8) \
    _CLC_INTCONV_VEC3(DST, SRC, MODE)

/* Scalar plus all vector overloads for one (DST, SRC, MODE). */
#define _CLC_INTCONV_FAMILY(DST, SRC, MODE) \
    _CLC_INTCONV_SCALAR(DST, SRC, MODE) \
    _CLC_INTCONV_VECTORS(DST, SRC, MODE)

/* The four rounding-mode families for one (DST, SRC), in the order rtz, rte, rtp, rtn. */
#define _CLC_INTCONV_ALL_MODES(DST, SRC) \
    _CLC_INTCONV_FAMILY(DST, SRC, _rtz) \
    _CLC_INTCONV_FAMILY(DST, SRC, _rte) \
    _CLC_INTCONV_FAMILY(DST, SRC, _rtp) \
    _CLC_INTCONV_FAMILY(DST, SRC, _rtn)

/*
 * short -> ulong, remaining overloads; compiled only when cles_khr_int64 is
 * defined. The scalar convert_ulong_rte(short) is defined immediately before
 * this block, so only the vector overloads of the _rte family are emitted here.
 */
#ifdef cles_khr_int64
_CLC_INTCONV_VECTORS(ulong, short, _rte)
_CLC_INTCONV_FAMILY(ulong, short, _rtp)
_CLC_INTCONV_FAMILY(ulong, short, _rtn)
#endif

/* ushort -> char, uchar, short, ushort, int, uint (always compiled). */
_CLC_INTCONV_ALL_MODES(char, ushort)
_CLC_INTCONV_ALL_MODES(uchar, ushort)
_CLC_INTCONV_ALL_MODES(short, ushort)
_CLC_INTCONV_ALL_MODES(ushort, ushort)
_CLC_INTCONV_ALL_MODES(int, ushort)
_CLC_INTCONV_ALL_MODES(uint, ushort)

/* ushort -> long, ulong; compiled only when cles_khr_int64 is defined. */
#ifdef cles_khr_int64
_CLC_INTCONV_ALL_MODES(long, ushort)
_CLC_INTCONV_ALL_MODES(ulong, ushort)
#endif

/* int -> char, uchar, short, ushort (always compiled). */
_CLC_INTCONV_ALL_MODES(char, int)
_CLC_INTCONV_ALL_MODES(uchar, int)
_CLC_INTCONV_ALL_MODES(short, int)
_CLC_INTCONV_ALL_MODES(ushort, int)

/* The generator macros are local to this block; do not leak them to the rest of the file. */
#undef _CLC_INTCONV_ALL_MODES
#undef _CLC_INTCONV_FAMILY
#undef _CLC_INTCONV_VECTORS
#undef _CLC_INTCONV_VEC3
#undef _CLC_INTCONV_HALVES
#undef _CLC_INTCONV_VEC2
#undef _CLC_INTCONV_SCALAR
+_CLC_DEF _CLC_OVERLOAD +int convert_int_rtz(int x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rtz(int2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rtz(int4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rtz(int8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rtz(int16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rtz(int3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int_rte(int x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rte(int2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rte(int4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rte(int8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rte(int16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rte(int3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int_rtp(int x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rtp(int2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rtp(int4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rtp(int8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rtp(int16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rtp(int3 x) +{ + return 
(int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int_rtn(int x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rtn(int2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rtn(int4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rtn(int8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rtn(int16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rtn(int3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rtz(int x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rtz(int2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rtz(int4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rtz(int8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rtz(int16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rtz(int3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rte(int x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rte(int2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rte(int4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rte(int8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rte(int16 x) +{ + return (uint16)(convert_uint8(x.lo), 
convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rte(int3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rtp(int x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rtp(int2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rtp(int4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rtp(int8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rtp(int16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rtp(int3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rtn(int x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rtn(int2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rtn(int4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rtn(int8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rtn(int16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rtn(int3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rtz(int x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rtz(int2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rtz(int4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rtz(int8 x) +{ + return 
(long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rtz(int16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rtz(int3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rte(int x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rte(int2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rte(int4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rte(int8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rte(int16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rte(int3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rtp(int x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rtp(int2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rtp(int4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rtp(int8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rtp(int16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rtp(int3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rtn(int x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rtn(int2 x) +{ + return (long2)(convert_long(x.lo), 
convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rtn(int4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rtn(int8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rtn(int16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rtn(int3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rtz(int x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rtz(int2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rtz(int4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rtz(int8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rtz(int16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rtz(int3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rte(int x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rte(int2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rte(int4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rte(int8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rte(int16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rte(int3 x) +{ + return 
(ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rtp(int x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rtp(int2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rtp(int4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rtp(int8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rtp(int16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rtp(int3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rtn(int x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rtn(int2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rtn(int4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rtn(int8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rtn(int16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rtn(int3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +_CLC_DEF _CLC_OVERLOAD +char convert_char_rtz(uint x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rtz(uint2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rtz(uint4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rtz(uint8 x) +{ + return 
(char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rtz(uint16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rtz(uint3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +char convert_char_rte(uint x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rte(uint2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rte(uint4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rte(uint8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rte(uint16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rte(uint3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +char convert_char_rtp(uint x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rtp(uint2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rtp(uint4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rtp(uint8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rtp(uint16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rtp(uint3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +char convert_char_rtn(uint x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rtn(uint2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rtn(uint4 x) +{ + 
return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rtn(uint8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rtn(uint16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rtn(uint3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rtz(uint x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rtz(uint2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rtz(uint4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rtz(uint8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rtz(uint16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rtz(uint3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rte(uint x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rte(uint2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rte(uint4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rte(uint8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rte(uint16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rte(uint3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rtp(uint x) +{ + return (uchar)x; 
+} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rtp(uint2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rtp(uint4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rtp(uint8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rtp(uint16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rtp(uint3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rtn(uint x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rtn(uint2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rtn(uint4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rtn(uint8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rtn(uint16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rtn(uint3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +short convert_short_rtz(uint x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rtz(uint2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rtz(uint4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rtz(uint8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_rtz(uint16 x) +{ + return (short16)(convert_short8(x.lo), 
convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rtz(uint3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +short convert_short_rte(uint x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rte(uint2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rte(uint4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rte(uint8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_rte(uint16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rte(uint3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +short convert_short_rtp(uint x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rtp(uint2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rtp(uint4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rtp(uint8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_rtp(uint16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rtp(uint3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +short convert_short_rtn(uint x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rtn(uint2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rtn(uint4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF 
_CLC_OVERLOAD +short8 convert_short8_rtn(uint8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_rtn(uint16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rtn(uint3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_rtz(uint x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_rtz(uint2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_rtz(uint4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_rtz(uint8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_rtz(uint16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_rtz(uint3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_rte(uint x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_rte(uint2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_rte(uint4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_rte(uint8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_rte(uint16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_rte(uint3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_rtp(uint x) +{ + return 
(ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_rtp(uint2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_rtp(uint4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_rtp(uint8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_rtp(uint16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_rtp(uint3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_rtn(uint x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_rtn(uint2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_rtn(uint4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_rtn(uint8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_rtn(uint16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_rtn(uint3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int_rtz(uint x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rtz(uint2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rtz(uint4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rtz(uint8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rtz(uint16 x) +{ + return 
(int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rtz(uint3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int_rte(uint x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rte(uint2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rte(uint4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rte(uint8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rte(uint16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rte(uint3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int_rtp(uint x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rtp(uint2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rtp(uint4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rtp(uint8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rtp(uint16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rtp(uint3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int_rtn(uint x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rtn(uint2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rtn(uint4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rtn(uint8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF 
_CLC_OVERLOAD +int16 convert_int16_rtn(uint16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rtn(uint3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rtz(uint x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rtz(uint2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rtz(uint4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rtz(uint8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rtz(uint16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rtz(uint3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rte(uint x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rte(uint2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rte(uint4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rte(uint8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rte(uint16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rte(uint3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rtp(uint x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rtp(uint2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rtp(uint4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF 
_CLC_OVERLOAD +uint8 convert_uint8_rtp(uint8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rtp(uint16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rtp(uint3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rtn(uint x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rtn(uint2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rtn(uint4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rtn(uint8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rtn(uint16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rtn(uint3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rtz(uint x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rtz(uint2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rtz(uint4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rtz(uint8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rtz(uint16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rtz(uint3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rte(uint x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rte(uint2 x) +{ + return 
(long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rte(uint4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rte(uint8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rte(uint16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rte(uint3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rtp(uint x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rtp(uint2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rtp(uint4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rtp(uint8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rtp(uint16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rtp(uint3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rtn(uint x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rtn(uint2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rtn(uint4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rtn(uint8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rtn(uint16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rtn(uint3 x) +{ + return 
(long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rtz(uint x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rtz(uint2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rtz(uint4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rtz(uint8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rtz(uint16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rtz(uint3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rte(uint x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rte(uint2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rte(uint4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rte(uint8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rte(uint16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rte(uint3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rtp(uint x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rtp(uint2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rtp(uint4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD 
+ulong8 convert_ulong8_rtp(uint8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rtp(uint16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rtp(uint3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rtn(uint x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rtn(uint2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rtn(uint4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rtn(uint8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rtn(uint16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rtn(uint3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char convert_char_rtz(long x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rtz(long2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rtz(long4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rtz(long8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rtz(long16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rtz(long3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char convert_char_rte(long x) +{ + return (char)x; +} + 
+_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rte(long2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rte(long4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rte(long8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rte(long16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rte(long3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char convert_char_rtp(long x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rtp(long2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rtp(long4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rtp(long8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rtp(long16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rtp(long3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char convert_char_rtn(long x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rtn(long2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rtn(long4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rtn(long8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rtn(long16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF 
_CLC_OVERLOAD +char3 convert_char3_rtn(long3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rtz(long x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rtz(long2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rtz(long4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rtz(long8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rtz(long16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rtz(long3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rte(long x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rte(long2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rte(long4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rte(long8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rte(long16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rte(long3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rtp(long x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rtp(long2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rtp(long4 x) +{ + return 
(uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rtp(long8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rtp(long16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rtp(long3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rtn(long x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rtn(long2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rtn(long4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rtn(long8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rtn(long16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rtn(long3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short convert_short_rtz(long x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rtz(long2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rtz(long4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rtz(long8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_rtz(long16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rtz(long3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} 
+#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short convert_short_rte(long x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rte(long2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rte(long4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rte(long8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_rte(long16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rte(long3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short convert_short_rtp(long x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rtp(long2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rtp(long4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rtp(long8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_rtp(long16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rtp(long3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short convert_short_rtn(long x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rtn(long2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rtn(long4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rtn(long8 x) +{ + return 
(short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_rtn(long16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rtn(long3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_rtz(long x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_rtz(long2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_rtz(long4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_rtz(long8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_rtz(long16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_rtz(long3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_rte(long x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_rte(long2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_rte(long4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_rte(long8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_rte(long16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_rte(long3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort 
convert_ushort_rtp(long x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_rtp(long2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_rtp(long4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_rtp(long8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_rtp(long16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_rtp(long3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_rtn(long x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_rtn(long2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_rtn(long4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_rtn(long8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_rtn(long16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_rtn(long3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int convert_int_rtz(long x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rtz(long2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rtz(long4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rtz(long8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); 
+} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rtz(long16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rtz(long3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int convert_int_rte(long x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rte(long2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rte(long4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rte(long8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rte(long16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rte(long3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int convert_int_rtp(long x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rtp(long2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rtp(long4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rtp(long8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rtp(long16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rtp(long3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int convert_int_rtn(long x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rtn(long2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rtn(long4 x) +{ + return (int4)(convert_int2(x.lo), 
convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rtn(long8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rtn(long16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rtn(long3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rtz(long x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rtz(long2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rtz(long4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rtz(long8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rtz(long16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rtz(long3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rte(long x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rte(long2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rte(long4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rte(long8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rte(long16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rte(long3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rtp(long x) +{ + return (uint)x; +} + +_CLC_DEF 
_CLC_OVERLOAD +uint2 convert_uint2_rtp(long2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rtp(long4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rtp(long8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rtp(long16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rtp(long3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rtn(long x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rtn(long2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rtn(long4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rtn(long8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rtn(long16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rtn(long3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rtz(long x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rtz(long2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rtz(long4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rtz(long8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rtz(long16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF 
_CLC_OVERLOAD +long3 convert_long3_rtz(long3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rte(long x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rte(long2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rte(long4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rte(long8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rte(long16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rte(long3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rtp(long x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rtp(long2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rtp(long4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rtp(long8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rtp(long16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rtp(long3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rtn(long x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rtn(long2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rtn(long4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 
convert_long8_rtn(long8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rtn(long16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rtn(long3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rtz(long x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rtz(long2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rtz(long4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rtz(long8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rtz(long16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rtz(long3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rte(long x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rte(long2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rte(long4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rte(long8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rte(long16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rte(long3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rtp(long x) +{ + return (ulong)x; 
+} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rtp(long2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rtp(long4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rtp(long8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rtp(long16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rtp(long3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rtn(long x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rtn(long2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rtn(long4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rtn(long8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rtn(long16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rtn(long3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char convert_char_rtz(ulong x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rtz(ulong2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rtz(ulong4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rtz(ulong8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rtz(ulong16 x) +{ + return 
(char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rtz(ulong3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char convert_char_rte(ulong x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rte(ulong2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rte(ulong4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rte(ulong8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rte(ulong16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rte(ulong3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char convert_char_rtp(ulong x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rtp(ulong2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rtp(ulong4 x) +{ + return (char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rtp(ulong8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rtp(ulong16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rtp(ulong3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char convert_char_rtn(ulong x) +{ + return (char)x; +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rtn(ulong2 x) +{ + return (char2)(convert_char(x.lo), convert_char(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rtn(ulong4 x) +{ + return 
(char4)(convert_char2(x.lo), convert_char2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rtn(ulong8 x) +{ + return (char8)(convert_char4(x.lo), convert_char4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rtn(ulong16 x) +{ + return (char16)(convert_char8(x.lo), convert_char8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rtn(ulong3 x) +{ + return (char3)(convert_char2(x.s01), convert_char(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rtz(ulong x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rtz(ulong2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rtz(ulong4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rtz(ulong8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rtz(ulong16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rtz(ulong3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rte(ulong x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rte(ulong2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rte(ulong4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rte(ulong8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rte(ulong16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rte(ulong3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +#endif 
+#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rtp(ulong x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rtp(ulong2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rtp(ulong4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rtp(ulong8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rtp(ulong16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rtp(ulong3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rtn(ulong x) +{ + return (uchar)x; +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rtn(ulong2 x) +{ + return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rtn(ulong4 x) +{ + return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rtn(ulong8 x) +{ + return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rtn(ulong16 x) +{ + return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rtn(ulong3 x) +{ + return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short convert_short_rtz(ulong x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rtz(ulong2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rtz(ulong4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rtz(ulong8 x) +{ + return 
(short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_rtz(ulong16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rtz(ulong3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short convert_short_rte(ulong x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rte(ulong2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rte(ulong4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rte(ulong8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_rte(ulong16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rte(ulong3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short convert_short_rtp(ulong x) +{ + return (short)x; +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rtp(ulong2 x) +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rtp(ulong4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rtp(ulong8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_rtp(ulong16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rtp(ulong3 x) +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short convert_short_rtn(ulong x) +{ + return (short)x; +} + 
+_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rtn(ulong2 x) /* ulong2 -> short2: each half (.lo/.hi) goes through the unsuffixed convert_short */ +{ + return (short2)(convert_short(x.lo), convert_short(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rtn(ulong4 x) +{ + return (short4)(convert_short2(x.lo), convert_short2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rtn(ulong8 x) +{ + return (short8)(convert_short4(x.lo), convert_short4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_rtn(ulong16 x) +{ + return (short16)(convert_short8(x.lo), convert_short8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rtn(ulong3 x) /* 3-wide: .s01 via convert_short2, .s2 via scalar convert_short */ +{ + return (short3)(convert_short2(x.s01), convert_short(x.s2)); +} +#endif +#ifdef cles_khr_int64 /* ulong -> ushort overloads with the _rtz suffix; compiled only when cles_khr_int64 is defined */ +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_rtz(ulong x) /* scalar case is a plain C cast */ +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_rtz(ulong2 x) /* vector cases split into .lo/.hi and delegate to the unsuffixed, half-width convert_ushort overload */ +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_rtz(ulong4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_rtz(ulong8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_rtz(ulong16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_rtz(ulong3 x) /* 3-wide: .s01 via convert_ushort2, .s2 via scalar convert_ushort */ +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +#endif +#ifdef cles_khr_int64 /* ulong -> ushort overloads with the _rte suffix; bodies match the _rtz group above token for token apart from the function names */ +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_rte(ulong x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_rte(ulong2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_rte(ulong4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_rte(ulong8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + 
+_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_rte(ulong16 x) /* ulong16 -> ushort16: each 8-wide half goes through the unsuffixed convert_ushort8 */ +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_rte(ulong3 x) /* 3-wide: .s01 via convert_ushort2, .s2 via scalar convert_ushort */ +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +#endif +#ifdef cles_khr_int64 /* ulong -> ushort overloads with the _rtp suffix; compiled only when cles_khr_int64 is defined */ +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_rtp(ulong x) /* scalar case is a plain C cast */ +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_rtp(ulong2 x) /* vector cases split into .lo/.hi and delegate to the unsuffixed, half-width convert_ushort overload */ +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_rtp(ulong4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_rtp(ulong8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_rtp(ulong16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_rtp(ulong3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +#endif +#ifdef cles_khr_int64 /* ulong -> ushort overloads with the _rtn suffix; bodies match the _rtp group above token for token apart from the function names */ +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_rtn(ulong x) +{ + return (ushort)x; +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_rtn(ulong2 x) +{ + return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_rtn(ulong4 x) +{ + return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_rtn(ulong8 x) +{ + return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_rtn(ulong16 x) +{ + return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_rtn(ulong3 x) +{ + return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2)); +} +#endif +#ifdef cles_khr_int64 /* ulong -> int overloads with the _rtz suffix start here; the rest of the group continues past this line */ +_CLC_DEF _CLC_OVERLOAD +int convert_int_rtz(ulong x) /* scalar case is a plain C cast */ +{ + return (int)x; +} + 
+_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rtz(ulong2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rtz(ulong4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rtz(ulong8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rtz(ulong16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rtz(ulong3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int convert_int_rte(ulong x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rte(ulong2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rte(ulong4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rte(ulong8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rte(ulong16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rte(ulong3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int convert_int_rtp(ulong x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rtp(ulong2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rtp(ulong4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rtp(ulong8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rtp(ulong16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rtp(ulong3 x) +{ + return 
(int3)(convert_int2(x.s01), convert_int(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int convert_int_rtn(ulong x) +{ + return (int)x; +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_rtn(ulong2 x) +{ + return (int2)(convert_int(x.lo), convert_int(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_rtn(ulong4 x) +{ + return (int4)(convert_int2(x.lo), convert_int2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_rtn(ulong8 x) +{ + return (int8)(convert_int4(x.lo), convert_int4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_rtn(ulong16 x) +{ + return (int16)(convert_int8(x.lo), convert_int8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_rtn(ulong3 x) +{ + return (int3)(convert_int2(x.s01), convert_int(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rtz(ulong x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rtz(ulong2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rtz(ulong4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rtz(ulong8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rtz(ulong16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rtz(ulong3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rte(ulong x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rte(ulong2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rte(ulong4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rte(ulong8 x) +{ + return (uint8)(convert_uint4(x.lo), 
convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rte(ulong16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rte(ulong3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rtp(ulong x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rtp(ulong2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rtp(ulong4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rtp(ulong8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rtp(ulong16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rtp(ulong3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_rtn(ulong x) +{ + return (uint)x; +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_rtn(ulong2 x) +{ + return (uint2)(convert_uint(x.lo), convert_uint(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_rtn(ulong4 x) +{ + return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_rtn(ulong8 x) +{ + return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_rtn(ulong16 x) +{ + return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_rtn(ulong3 x) +{ + return (uint3)(convert_uint2(x.s01), convert_uint(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rtz(ulong x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rtz(ulong2 x) +{ + return (long2)(convert_long(x.lo), 
convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rtz(ulong4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rtz(ulong8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rtz(ulong16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rtz(ulong3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rte(ulong x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rte(ulong2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rte(ulong4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rte(ulong8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rte(ulong16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rte(ulong3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rtp(ulong x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rtp(ulong2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rtp(ulong4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rtp(ulong8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rtp(ulong16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rtp(ulong3 x) +{ + return 
(long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_rtn(ulong x) +{ + return (long)x; +} + +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rtn(ulong2 x) +{ + return (long2)(convert_long(x.lo), convert_long(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rtn(ulong4 x) +{ + return (long4)(convert_long2(x.lo), convert_long2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rtn(ulong8 x) +{ + return (long8)(convert_long4(x.lo), convert_long4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rtn(ulong16 x) +{ + return (long16)(convert_long8(x.lo), convert_long8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rtn(ulong3 x) +{ + return (long3)(convert_long2(x.s01), convert_long(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rtz(ulong x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rtz(ulong2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rtz(ulong4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rtz(ulong8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rtz(ulong16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rtz(ulong3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rte(ulong x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rte(ulong2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rte(ulong4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 
convert_ulong8_rte(ulong8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rte(ulong16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rte(ulong3 x) /* 3-wide: .s01 via convert_ulong2, .s2 via scalar convert_ulong */ +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +#ifdef cles_khr_int64 /* ulong -> ulong overloads with the _rtp suffix; compiled only when cles_khr_int64 is defined */ +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rtp(ulong x) /* scalar case is a plain C cast to the same type */ +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rtp(ulong2 x) /* vector cases split into .lo/.hi and delegate to the unsuffixed, half-width convert_ulong overload */ +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rtp(ulong4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rtp(ulong8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rtp(ulong16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rtp(ulong3 x) +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} +#endif +#ifdef cles_khr_int64 /* ulong -> ulong overloads with the _rtn suffix; bodies match the _rtp group above token for token apart from the function names */ +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rtn(ulong x) +{ + return (ulong)x; +} + +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rtn(ulong2 x) +{ + return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rtn(ulong4 x) +{ + return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rtn(ulong8 x) +{ + return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rtn(ulong16 x) +{ + return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi)); +} + +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rtn(ulong3 x) /* 3-wide: .s01 via convert_ulong2, .s2 via scalar convert_ulong */ +{ + return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2)); +} /* the function body has to be closed before the #endif that ends this group */ +#endif +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat(char x) /* char -> char: source and destination types are the same, so x is returned unchanged */ +{ + return x; +} 
+_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat(char2 x) +{ + return x; +} +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat(char3 x) +{ + return x; +} +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat(char4 x) +{ + return x; +} +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat(char8 x) +{ + return x; +} +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat(char16 x) +{ + return x; +} +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat(char x) +{ + x = max(x, (char)0); + return convert_uchar(x); +} +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat(char2 x) +{ + x = max(x, (char)0); + return convert_uchar2(x); +} +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat(char3 x) +{ + x = max(x, (char)0); + return convert_uchar3(x); +} +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat(char4 x) +{ + x = max(x, (char)0); + return convert_uchar4(x); +} +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat(char8 x) +{ + x = max(x, (char)0); + return convert_uchar8(x); +} +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat(char16 x) +{ + x = max(x, (char)0); + return convert_uchar16(x); +} +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat(char x) +{ + return convert_short(x); +} +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat(char2 x) +{ + return convert_short2(x); +} +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat(char3 x) +{ + return convert_short3(x); +} +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat(char4 x) +{ + return convert_short4(x); +} +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat(char8 x) +{ + return convert_short8(x); +} +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat(char16 x) +{ + return convert_short16(x); +} +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat(char x) +{ + x = max(x, (char)0); + return convert_ushort(x); +} +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat(char2 x) +{ + x = max(x, (char)0); + return convert_ushort2(x); +} +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_sat(char3 x) +{ + x = max(x, (char)0); + return 
convert_ushort3(x); +} +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat(char4 x) +{ + x = max(x, (char)0); + return convert_ushort4(x); +} +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_sat(char8 x) +{ + x = max(x, (char)0); + return convert_ushort8(x); +} +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat(char16 x) +{ + x = max(x, (char)0); + return convert_ushort16(x); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat(char x) +{ + return convert_int(x); +} +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat(char2 x) +{ + return convert_int2(x); +} +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat(char3 x) +{ + return convert_int3(x); +} +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat(char4 x) +{ + return convert_int4(x); +} +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat(char8 x) +{ + return convert_int8(x); +} +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat(char16 x) +{ + return convert_int16(x); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat(char x) +{ + x = max(x, (char)0); + return convert_uint(x); +} +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat(char2 x) +{ + x = max(x, (char)0); + return convert_uint2(x); +} +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat(char3 x) +{ + x = max(x, (char)0); + return convert_uint3(x); +} +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat(char4 x) +{ + x = max(x, (char)0); + return convert_uint4(x); +} +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat(char8 x) +{ + x = max(x, (char)0); + return convert_uint8(x); +} +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat(char16 x) +{ + x = max(x, (char)0); + return convert_uint16(x); +} +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat(char x) +{ + return convert_long(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat(char2 x) +{ + return convert_long2(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat(char3 x) +{ + return convert_long3(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF 
_CLC_OVERLOAD +long4 convert_long4_sat(char4 x) +{ + return convert_long4(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat(char8 x) +{ + return convert_long8(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat(char16 x) +{ + return convert_long16(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat(char x) +{ + x = max(x, (char)0); + return convert_ulong(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat(char2 x) +{ + x = max(x, (char)0); + return convert_ulong2(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat(char3 x) +{ + x = max(x, (char)0); + return convert_ulong3(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat(char4 x) +{ + x = max(x, (char)0); + return convert_ulong4(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat(char8 x) +{ + x = max(x, (char)0); + return convert_ulong8(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat(char16 x) +{ + x = max(x, (char)0); + return convert_ulong16(x); +} +#endif +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat(uchar x) +{ + x = min(x, (uchar)CHAR_MAX); + return convert_char(x); +} +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat(uchar2 x) +{ + x = min(x, (uchar)CHAR_MAX); + return convert_char2(x); +} +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat(uchar3 x) +{ + x = min(x, (uchar)CHAR_MAX); + return convert_char3(x); +} +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat(uchar4 x) +{ + x = min(x, (uchar)CHAR_MAX); + return convert_char4(x); +} +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat(uchar8 x) +{ + x = min(x, (uchar)CHAR_MAX); + return convert_char8(x); +} +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat(uchar16 x) +{ + x = min(x, (uchar)CHAR_MAX); + return convert_char16(x); +} +_CLC_DEF _CLC_OVERLOAD +uchar 
convert_uchar_sat(uchar x) +{ + return x; +} +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat(uchar2 x) +{ + return x; +} +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat(uchar3 x) +{ + return x; +} +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat(uchar4 x) +{ + return x; +} +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat(uchar8 x) +{ + return x; +} +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat(uchar16 x) +{ + return x; +} +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat(uchar x) +{ + return convert_short(x); +} +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat(uchar2 x) +{ + return convert_short2(x); +} +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat(uchar3 x) +{ + return convert_short3(x); +} +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat(uchar4 x) +{ + return convert_short4(x); +} +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat(uchar8 x) +{ + return convert_short8(x); +} +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat(uchar16 x) +{ + return convert_short16(x); +} +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat(uchar x) +{ + return convert_ushort(x); +} +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat(uchar2 x) +{ + return convert_ushort2(x); +} +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_sat(uchar3 x) +{ + return convert_ushort3(x); +} +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat(uchar4 x) +{ + return convert_ushort4(x); +} +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_sat(uchar8 x) +{ + return convert_ushort8(x); +} +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat(uchar16 x) +{ + return convert_ushort16(x); +} +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat(uchar x) +{ + return convert_int(x); +} +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat(uchar2 x) +{ + return convert_int2(x); +} +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat(uchar3 x) +{ + return convert_int3(x); +} +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat(uchar4 x) +{ + return convert_int4(x); +} +_CLC_DEF _CLC_OVERLOAD +int8 
convert_int8_sat(uchar8 x) +{ + return convert_int8(x); +} +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat(uchar16 x) +{ + return convert_int16(x); +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat(uchar x) +{ + return convert_uint(x); +} +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat(uchar2 x) +{ + return convert_uint2(x); +} +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat(uchar3 x) +{ + return convert_uint3(x); +} +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat(uchar4 x) +{ + return convert_uint4(x); +} +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat(uchar8 x) +{ + return convert_uint8(x); +} +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat(uchar16 x) +{ + return convert_uint16(x); +} +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat(uchar x) +{ + return convert_long(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat(uchar2 x) +{ + return convert_long2(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat(uchar3 x) +{ + return convert_long3(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat(uchar4 x) +{ + return convert_long4(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat(uchar8 x) +{ + return convert_long8(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat(uchar16 x) +{ + return convert_long16(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat(uchar x) +{ + return convert_ulong(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat(uchar2 x) +{ + return convert_ulong2(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat(uchar3 x) +{ + return convert_ulong3(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat(uchar4 x) +{ + return convert_ulong4(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 
convert_ulong8_sat(uchar8 x) +{ + return convert_ulong8(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat(uchar16 x) +{ + return convert_ulong16(x); +} +#endif +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat(short x) +{ + x = clamp(x, (short)CHAR_MIN, (short)CHAR_MAX); + return convert_char(x); +} +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat(short2 x) +{ + x = clamp(x, (short)CHAR_MIN, (short)CHAR_MAX); + return convert_char2(x); +} +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat(short3 x) +{ + x = clamp(x, (short)CHAR_MIN, (short)CHAR_MAX); + return convert_char3(x); +} +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat(short4 x) +{ + x = clamp(x, (short)CHAR_MIN, (short)CHAR_MAX); + return convert_char4(x); +} +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat(short8 x) +{ + x = clamp(x, (short)CHAR_MIN, (short)CHAR_MAX); + return convert_char8(x); +} +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat(short16 x) +{ + x = clamp(x, (short)CHAR_MIN, (short)CHAR_MAX); + return convert_char16(x); +} +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat(short x) +{ + x = clamp(x, (short)0, (short)UCHAR_MAX); + return convert_uchar(x); +} +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat(short2 x) +{ + x = clamp(x, (short)0, (short)UCHAR_MAX); + return convert_uchar2(x); +} +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat(short3 x) +{ + x = clamp(x, (short)0, (short)UCHAR_MAX); + return convert_uchar3(x); +} +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat(short4 x) +{ + x = clamp(x, (short)0, (short)UCHAR_MAX); + return convert_uchar4(x); +} +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat(short8 x) +{ + x = clamp(x, (short)0, (short)UCHAR_MAX); + return convert_uchar8(x); +} +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat(short16 x) +{ + x = clamp(x, (short)0, (short)UCHAR_MAX); + return convert_uchar16(x); +} +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat(short x) +{ + return x; +} +_CLC_DEF _CLC_OVERLOAD +short2 
convert_short2_sat(short2 x) +{ + return x; +}

/*
 * Saturating integer conversions whose source element type is short,
 * ushort or int.  Every overload is stamped out by one of the generator
 * macros below; the *_ALL forms emit the scalar overload followed by the
 * 2-, 3-, 4-, 8- and 16-wide vector overloads.  ELEM is always the scalar
 * source type, because the bounds are passed to min/max/clamp as scalars.
 */

/* Source and destination types are the same: the value is returned as is. */
#define _CLC_SAT_SAME(T) \
  _CLC_DEF _CLC_OVERLOAD T convert_##T##_sat(T x) \
  { return x; }

/* No bound is applied: defer to the non-saturating conversion. */
#define _CLC_SAT_WIDEN(DST, SRC) \
  _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat(SRC x) \
  { return convert_##DST(x); }

/* Signed source, unsigned destination: raise negative values to zero. */
#define _CLC_SAT_FLOOR0(DST, SRC, ELEM) \
  _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat(SRC x) \
  { return convert_##DST(max(x, (ELEM)0)); }

/* Unsigned source: only the upper bound HI has to be enforced. */
#define _CLC_SAT_CEIL(DST, SRC, ELEM, HI) \
  _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat(SRC x) \
  { return convert_##DST(min(x, (ELEM)HI)); }

/* Signed source: limit to [LO, HI] before converting. */
#define _CLC_SAT_CLAMP(DST, SRC, ELEM, LO, HI) \
  _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat(SRC x) \
  { return convert_##DST(clamp(x, (ELEM)LO, (ELEM)HI)); }

#define _CLC_SAT_SAME_ALL(T) \
  _CLC_SAT_SAME(T)    _CLC_SAT_SAME(T##2) _CLC_SAT_SAME(T##3) \
  _CLC_SAT_SAME(T##4) _CLC_SAT_SAME(T##8) _CLC_SAT_SAME(T##16)

#define _CLC_SAT_WIDEN_ALL(DST, SRC) \
  _CLC_SAT_WIDEN(DST, SRC) \
  _CLC_SAT_WIDEN(DST##2, SRC##2) \
  _CLC_SAT_WIDEN(DST##3, SRC##3) \
  _CLC_SAT_WIDEN(DST##4, SRC##4) \
  _CLC_SAT_WIDEN(DST##8, SRC##8) \
  _CLC_SAT_WIDEN(DST##16, SRC##16)

#define _CLC_SAT_FLOOR0_ALL(DST, SRC) \
  _CLC_SAT_FLOOR0(DST, SRC, SRC) \
  _CLC_SAT_FLOOR0(DST##2, SRC##2, SRC) \
  _CLC_SAT_FLOOR0(DST##3, SRC##3, SRC) \
  _CLC_SAT_FLOOR0(DST##4, SRC##4, SRC) \
  _CLC_SAT_FLOOR0(DST##8, SRC##8, SRC) \
  _CLC_SAT_FLOOR0(DST##16, SRC##16, SRC)

#define _CLC_SAT_CEIL_ALL(DST, SRC, HI) \
  _CLC_SAT_CEIL(DST, SRC, SRC, HI) \
  _CLC_SAT_CEIL(DST##2, SRC##2, SRC, HI) \
  _CLC_SAT_CEIL(DST##3, SRC##3, SRC, HI) \
  _CLC_SAT_CEIL(DST##4, SRC##4, SRC, HI) \
  _CLC_SAT_CEIL(DST##8, SRC##8, SRC, HI) \
  _CLC_SAT_CEIL(DST##16, SRC##16, SRC, HI)

#define _CLC_SAT_CLAMP_ALL(DST, SRC, LO, HI) \
  _CLC_SAT_CLAMP(DST, SRC, SRC, LO, HI) \
  _CLC_SAT_CLAMP(DST##2, SRC##2, SRC, LO, HI) \
  _CLC_SAT_CLAMP(DST##3, SRC##3, SRC, LO, HI) \
  _CLC_SAT_CLAMP(DST##4, SRC##4, SRC, LO, HI) \
  _CLC_SAT_CLAMP(DST##8, SRC##8, SRC, LO, HI) \
  _CLC_SAT_CLAMP(DST##16, SRC##16, SRC, LO, HI)

/* ---- source: short ---------------------------------------------------- */

/* The scalar and 2-wide short -> short overloads precede this point. */
_CLC_SAT_SAME(short3)
_CLC_SAT_SAME(short4)
_CLC_SAT_SAME(short8)
_CLC_SAT_SAME(short16)

_CLC_SAT_FLOOR0_ALL(ushort, short)
_CLC_SAT_WIDEN_ALL(int, short)
_CLC_SAT_FLOOR0_ALL(uint, short)

#ifdef cles_khr_int64
_CLC_SAT_WIDEN_ALL(long, short)
_CLC_SAT_FLOOR0_ALL(ulong, short)
#endif

/* ---- source: ushort --------------------------------------------------- */

_CLC_SAT_CEIL_ALL(char, ushort, CHAR_MAX)
_CLC_SAT_CEIL_ALL(uchar, ushort, UCHAR_MAX)
_CLC_SAT_CEIL_ALL(short, ushort, SHRT_MAX)
_CLC_SAT_SAME_ALL(ushort)
_CLC_SAT_WIDEN_ALL(int, ushort)
_CLC_SAT_WIDEN_ALL(uint, ushort)

#ifdef cles_khr_int64
_CLC_SAT_WIDEN_ALL(long, ushort)
_CLC_SAT_WIDEN_ALL(ulong, ushort)
#endif

/* ---- source: int ------------------------------------------------------ */

_CLC_SAT_CLAMP_ALL(char, int, CHAR_MIN, CHAR_MAX)
_CLC_SAT_CLAMP_ALL(uchar, int, 0, UCHAR_MAX)
_CLC_SAT_CLAMP_ALL(short, int, SHRT_MIN, SHRT_MAX)
_CLC_SAT_CLAMP_ALL(ushort, int, 0, USHRT_MAX)
_CLC_SAT_SAME_ALL(int)
_CLC_SAT_FLOOR0_ALL(uint, int)

#ifdef cles_khr_int64
/* Only the scalar and 2-wide int -> long overloads belong to this span. */
_CLC_SAT_WIDEN(long, int)
_CLC_SAT_WIDEN(long2, int2)
#endif

#undef _CLC_SAT_CLAMP_ALL
#undef _CLC_SAT_CEIL_ALL
#undef _CLC_SAT_FLOOR0_ALL
#undef _CLC_SAT_WIDEN_ALL
#undef _CLC_SAT_SAME_ALL
#undef _CLC_SAT_CLAMP
#undef _CLC_SAT_CEIL
#undef _CLC_SAT_FLOOR0
#undef _CLC_SAT_WIDEN
#undef _CLC_SAT_SAME
/*
 * Saturating integer conversions: the remaining int -> long/ulong
 * overloads, every uint-source overload up to uint -> uint, and the first
 * four uint -> long overloads.  Each overload is stamped out by one of the
 * generator macros below; the *_ALL forms emit the scalar overload followed
 * by the 2-, 3-, 4-, 8- and 16-wide vector overloads.  ELEM is always the
 * scalar source type, because the bounds are passed to min/max as scalars.
 */

/* Source and destination types are the same: the value is returned as is. */
#define _CLC_SAT_SAME(T) \
  _CLC_DEF _CLC_OVERLOAD T convert_##T##_sat(T x) \
  { return x; }

/* No bound is applied: defer to the non-saturating conversion. */
#define _CLC_SAT_WIDEN(DST, SRC) \
  _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat(SRC x) \
  { return convert_##DST(x); }

/* Signed source, unsigned destination: raise negative values to zero. */
#define _CLC_SAT_FLOOR0(DST, SRC, ELEM) \
  _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat(SRC x) \
  { return convert_##DST(max(x, (ELEM)0)); }

/* Unsigned source: only the upper bound HI has to be enforced. */
#define _CLC_SAT_CEIL(DST, SRC, ELEM, HI) \
  _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat(SRC x) \
  { return convert_##DST(min(x, (ELEM)HI)); }

#define _CLC_SAT_SAME_ALL(T) \
  _CLC_SAT_SAME(T)    _CLC_SAT_SAME(T##2) _CLC_SAT_SAME(T##3) \
  _CLC_SAT_SAME(T##4) _CLC_SAT_SAME(T##8) _CLC_SAT_SAME(T##16)

#define _CLC_SAT_FLOOR0_ALL(DST, SRC) \
  _CLC_SAT_FLOOR0(DST, SRC, SRC) \
  _CLC_SAT_FLOOR0(DST##2, SRC##2, SRC) \
  _CLC_SAT_FLOOR0(DST##3, SRC##3, SRC) \
  _CLC_SAT_FLOOR0(DST##4, SRC##4, SRC) \
  _CLC_SAT_FLOOR0(DST##8, SRC##8, SRC) \
  _CLC_SAT_FLOOR0(DST##16, SRC##16, SRC)

#define _CLC_SAT_CEIL_ALL(DST, SRC, HI) \
  _CLC_SAT_CEIL(DST, SRC, SRC, HI) \
  _CLC_SAT_CEIL(DST##2, SRC##2, SRC, HI) \
  _CLC_SAT_CEIL(DST##3, SRC##3, SRC, HI) \
  _CLC_SAT_CEIL(DST##4, SRC##4, SRC, HI) \
  _CLC_SAT_CEIL(DST##8, SRC##8, SRC, HI) \
  _CLC_SAT_CEIL(DST##16, SRC##16, SRC, HI)

/* ---- source: int (continued) ------------------------------------------ */

#ifdef cles_khr_int64
/* The scalar and 2-wide int -> long overloads precede this point. */
_CLC_SAT_WIDEN(long3, int3)
_CLC_SAT_WIDEN(long4, int4)
_CLC_SAT_WIDEN(long8, int8)
_CLC_SAT_WIDEN(long16, int16)

_CLC_SAT_FLOOR0_ALL(ulong, int)
#endif

/* ---- source: uint ----------------------------------------------------- */

_CLC_SAT_CEIL_ALL(char, uint, CHAR_MAX)
_CLC_SAT_CEIL_ALL(uchar, uint, UCHAR_MAX)
_CLC_SAT_CEIL_ALL(short, uint, SHRT_MAX)
_CLC_SAT_CEIL_ALL(ushort, uint, USHRT_MAX)
_CLC_SAT_CEIL_ALL(int, uint, INT_MAX)
_CLC_SAT_SAME_ALL(uint)

#ifdef cles_khr_int64
/* The 8- and 16-wide uint -> long overloads follow this span. */
_CLC_SAT_WIDEN(long, uint)
_CLC_SAT_WIDEN(long2, uint2)
_CLC_SAT_WIDEN(long3, uint3)
_CLC_SAT_WIDEN(long4, uint4)
#endif

#undef _CLC_SAT_CEIL_ALL
#undef _CLC_SAT_FLOOR0_ALL
#undef _CLC_SAT_SAME_ALL
#undef _CLC_SAT_CEIL
#undef _CLC_SAT_FLOOR0
#undef _CLC_SAT_WIDEN
#undef _CLC_SAT_SAME
/*
 * Saturating integer conversions that involve 64-bit types: the remaining
 * uint -> long/ulong overloads and every long-source overload up to the
 * 2-wide long -> ulong.  All of them exist only when cles_khr_int64 is
 * defined, so the whole span sits under a single guard.  The *_ALL forms
 * emit the scalar overload followed by the 2-, 3-, 4-, 8- and 16-wide
 * vector overloads; ELEM is always the scalar source type, because the
 * bounds are passed to max/clamp as scalars.
 */
#ifdef cles_khr_int64

/* Source and destination types are the same: the value is returned as is. */
#define _CLC_SAT_SAME(T) \
  _CLC_DEF _CLC_OVERLOAD T convert_##T##_sat(T x) \
  { return x; }

/* No bound is applied: defer to the non-saturating conversion. */
#define _CLC_SAT_WIDEN(DST, SRC) \
  _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat(SRC x) \
  { return convert_##DST(x); }

/* Signed source, unsigned destination: raise negative values to zero. */
#define _CLC_SAT_FLOOR0(DST, SRC, ELEM) \
  _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat(SRC x) \
  { return convert_##DST(max(x, (ELEM)0)); }

/* Signed source: limit to [LO, HI] before converting. */
#define _CLC_SAT_CLAMP(DST, SRC, ELEM, LO, HI) \
  _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat(SRC x) \
  { return convert_##DST(clamp(x, (ELEM)LO, (ELEM)HI)); }

#define _CLC_SAT_SAME_ALL(T) \
  _CLC_SAT_SAME(T)    _CLC_SAT_SAME(T##2) _CLC_SAT_SAME(T##3) \
  _CLC_SAT_SAME(T##4) _CLC_SAT_SAME(T##8) _CLC_SAT_SAME(T##16)

#define _CLC_SAT_WIDEN_ALL(DST, SRC) \
  _CLC_SAT_WIDEN(DST, SRC) \
  _CLC_SAT_WIDEN(DST##2, SRC##2) \
  _CLC_SAT_WIDEN(DST##3, SRC##3) \
  _CLC_SAT_WIDEN(DST##4, SRC##4) \
  _CLC_SAT_WIDEN(DST##8, SRC##8) \
  _CLC_SAT_WIDEN(DST##16, SRC##16)

#define _CLC_SAT_CLAMP_ALL(DST, SRC, LO, HI) \
  _CLC_SAT_CLAMP(DST, SRC, SRC, LO, HI) \
  _CLC_SAT_CLAMP(DST##2, SRC##2, SRC, LO, HI) \
  _CLC_SAT_CLAMP(DST##3, SRC##3, SRC, LO, HI) \
  _CLC_SAT_CLAMP(DST##4, SRC##4, SRC, LO, HI) \
  _CLC_SAT_CLAMP(DST##8, SRC##8, SRC, LO, HI) \
  _CLC_SAT_CLAMP(DST##16, SRC##16, SRC, LO, HI)

/* ---- source: uint (continued) ----------------------------------------- */

/* The scalar through 4-wide uint -> long overloads precede this point. */
_CLC_SAT_WIDEN(long8, uint8)
_CLC_SAT_WIDEN(long16, uint16)

_CLC_SAT_WIDEN_ALL(ulong, uint)

/* ---- source: long ----------------------------------------------------- */

_CLC_SAT_CLAMP_ALL(char, long, CHAR_MIN, CHAR_MAX)
_CLC_SAT_CLAMP_ALL(uchar, long, 0, UCHAR_MAX)
_CLC_SAT_CLAMP_ALL(short, long, SHRT_MIN, SHRT_MAX)
_CLC_SAT_CLAMP_ALL(ushort, long, 0, USHRT_MAX)
_CLC_SAT_CLAMP_ALL(int, long, INT_MIN, INT_MAX)
_CLC_SAT_CLAMP_ALL(uint, long, 0, UINT_MAX)
_CLC_SAT_SAME_ALL(long)

/* The 3-wide and wider long -> ulong overloads follow this span. */
_CLC_SAT_FLOOR0(ulong, long, long)
_CLC_SAT_FLOOR0(ulong2, long2, long)

#undef _CLC_SAT_CLAMP_ALL
#undef _CLC_SAT_WIDEN_ALL
#undef _CLC_SAT_SAME_ALL
#undef _CLC_SAT_CLAMP
#undef _CLC_SAT_FLOOR0
#undef _CLC_SAT_WIDEN
#undef _CLC_SAT_SAME

#endif /* cles_khr_int64 */
/*
 * Saturating conversions: the remaining long -> ulong overloads, every
 * ulong-source overload (both groups only when cles_khr_int64 is defined),
 * and the float-source overloads for char, uchar, short and ushort.
 * The *_ALL forms emit the scalar overload followed by the 2-, 3-, 4-, 8-
 * and 16-wide vector overloads.
 */

#ifdef cles_khr_int64

/* Source and destination types are the same: the value is returned as is. */
#define _CLC_SAT_SAME(T) \
  _CLC_DEF _CLC_OVERLOAD T convert_##T##_sat(T x) \
  { return x; }

/* Signed source, unsigned destination: raise negative values to zero. */
#define _CLC_SAT_FLOOR0(DST, SRC, ELEM) \
  _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat(SRC x) \
  { return convert_##DST(max(x, (ELEM)0)); }

/* Unsigned source: only the upper bound HI has to be enforced. */
#define _CLC_SAT_CEIL(DST, SRC, ELEM, HI) \
  _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat(SRC x) \
  { return convert_##DST(min(x, (ELEM)HI)); }

#define _CLC_SAT_SAME_ALL(T) \
  _CLC_SAT_SAME(T)    _CLC_SAT_SAME(T##2) _CLC_SAT_SAME(T##3) \
  _CLC_SAT_SAME(T##4) _CLC_SAT_SAME(T##8) _CLC_SAT_SAME(T##16)

#define _CLC_SAT_CEIL_ALL(DST, SRC, HI) \
  _CLC_SAT_CEIL(DST, SRC, SRC, HI) \
  _CLC_SAT_CEIL(DST##2, SRC##2, SRC, HI) \
  _CLC_SAT_CEIL(DST##3, SRC##3, SRC, HI) \
  _CLC_SAT_CEIL(DST##4, SRC##4, SRC, HI) \
  _CLC_SAT_CEIL(DST##8, SRC##8, SRC, HI) \
  _CLC_SAT_CEIL(DST##16, SRC##16, SRC, HI)

/* ---- source: long (continued) ----------------------------------------- */

/* The scalar and 2-wide long -> ulong overloads precede this point. */
_CLC_SAT_FLOOR0(ulong3, long3, long)
_CLC_SAT_FLOOR0(ulong4, long4, long)
_CLC_SAT_FLOOR0(ulong8, long8, long)
_CLC_SAT_FLOOR0(ulong16, long16, long)

/* ---- source: ulong ---------------------------------------------------- */

_CLC_SAT_CEIL_ALL(char, ulong, CHAR_MAX)
_CLC_SAT_CEIL_ALL(uchar, ulong, UCHAR_MAX)
_CLC_SAT_CEIL_ALL(short, ulong, SHRT_MAX)
_CLC_SAT_CEIL_ALL(ushort, ulong, USHRT_MAX)
_CLC_SAT_CEIL_ALL(int, ulong, INT_MAX)
_CLC_SAT_CEIL_ALL(uint, ulong, UINT_MAX)
_CLC_SAT_CEIL_ALL(long, ulong, LONG_MAX)
_CLC_SAT_SAME_ALL(ulong)

#undef _CLC_SAT_CEIL_ALL
#undef _CLC_SAT_SAME_ALL
#undef _CLC_SAT_CEIL
#undef _CLC_SAT_FLOOR0
#undef _CLC_SAT_SAME

#endif /* cles_khr_int64 */

/* ---- source: float ---------------------------------------------------- */

/*
 * float -> signed integer.  The value is converted without saturation
 * first; the two comparisons against LO and HI are then converted to DST
 * and used as select() conditions to substitute the matching limit.
 */
#define _CLC_SAT_FLT_S(DST, FLT, LO, HI) \
  _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat(FLT x) \
  { \
    DST raw = convert_##DST(x); \
    const DST below = convert_##DST(x < (FLT)LO); \
    const DST above = convert_##DST(x > (FLT)HI); \
    return select(select(raw, (DST)LO, below), (DST)HI, above); \
  }

/*
 * float -> unsigned integer.  Same scheme with a lower limit of zero; the
 * comparison results are converted to the signed type SDST and then
 * reinterpreted as UDST to form the select() conditions.
 */
#define _CLC_SAT_FLT_U(UDST, SDST, FLT, HI) \
  _CLC_DEF _CLC_OVERLOAD UDST convert_##UDST##_sat(FLT x) \
  { \
    UDST raw = convert_##UDST(x); \
    const UDST below = as_##UDST(convert_##SDST(x < (FLT)0)); \
    const UDST above = as_##UDST(convert_##SDST(x > (FLT)HI)); \
    return select(select(raw, (UDST)0, below), (UDST)HI, above); \
  }

#define _CLC_SAT_FLT_S_ALL(DST, FLT, LO, HI) \
  _CLC_SAT_FLT_S(DST, FLT, LO, HI) \
  _CLC_SAT_FLT_S(DST##2, FLT##2, LO, HI) \
  _CLC_SAT_FLT_S(DST##3, FLT##3, LO, HI) \
  _CLC_SAT_FLT_S(DST##4, FLT##4, LO, HI) \
  _CLC_SAT_FLT_S(DST##8, FLT##8, LO, HI) \
  _CLC_SAT_FLT_S(DST##16, FLT##16, LO, HI)

#define _CLC_SAT_FLT_U_ALL(UDST, SDST, FLT, HI) \
  _CLC_SAT_FLT_U(UDST, SDST, FLT, HI) \
  _CLC_SAT_FLT_U(UDST##2, SDST##2, FLT##2, HI) \
  _CLC_SAT_FLT_U(UDST##3, SDST##3, FLT##3, HI) \
  _CLC_SAT_FLT_U(UDST##4, SDST##4, FLT##4, HI) \
  _CLC_SAT_FLT_U(UDST##8, SDST##8, FLT##8, HI) \
  _CLC_SAT_FLT_U(UDST##16, SDST##16, FLT##16, HI)

_CLC_SAT_FLT_S_ALL(char, float, CHAR_MIN, CHAR_MAX)
_CLC_SAT_FLT_U_ALL(uchar, char, float, UCHAR_MAX)
_CLC_SAT_FLT_S_ALL(short, float, SHRT_MIN, SHRT_MAX)

/* The 16-wide float -> ushort overload starts below and runs past this span. */
_CLC_SAT_FLT_U(ushort, short, float, USHRT_MAX)
_CLC_SAT_FLT_U(ushort2, short2, float2, USHRT_MAX)
_CLC_SAT_FLT_U(ushort3, short3, float3, USHRT_MAX)
_CLC_SAT_FLT_U(ushort4, short4, float4, USHRT_MAX)
_CLC_SAT_FLT_U(ushort8, short8, float8, USHRT_MAX)

#undef _CLC_SAT_FLT_U_ALL
#undef _CLC_SAT_FLT_S_ALL
#undef _CLC_SAT_FLT_U
#undef _CLC_SAT_FLT_S

+_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat(float16 x) +{ + ushort16 y = convert_ushort16(x); + y = select(y, (ushort16)0, as_ushort16(convert_short16(x < (float16)0))); + y = 
select(y, (ushort16)USHRT_MAX, as_ushort16(convert_short16(x > (float16)USHRT_MAX))); + return y; +} +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat(float x) +{ + int y = convert_int(x); + y = select(y, (int)INT_MIN, convert_int(x < (float)INT_MIN)); + y = select(y, (int)INT_MAX, convert_int(x > (float)INT_MAX)); + return y; +} +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat(float2 x) +{ + int2 y = convert_int2(x); + y = select(y, (int2)INT_MIN, convert_int2(x < (float2)INT_MIN)); + y = select(y, (int2)INT_MAX, convert_int2(x > (float2)INT_MAX)); + return y; +} +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat(float3 x) +{ + int3 y = convert_int3(x); + y = select(y, (int3)INT_MIN, convert_int3(x < (float3)INT_MIN)); + y = select(y, (int3)INT_MAX, convert_int3(x > (float3)INT_MAX)); + return y; +} +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat(float4 x) +{ + int4 y = convert_int4(x); + y = select(y, (int4)INT_MIN, convert_int4(x < (float4)INT_MIN)); + y = select(y, (int4)INT_MAX, convert_int4(x > (float4)INT_MAX)); + return y; +} +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat(float8 x) +{ + int8 y = convert_int8(x); + y = select(y, (int8)INT_MIN, convert_int8(x < (float8)INT_MIN)); + y = select(y, (int8)INT_MAX, convert_int8(x > (float8)INT_MAX)); + return y; +} +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat(float16 x) +{ + int16 y = convert_int16(x); + y = select(y, (int16)INT_MIN, convert_int16(x < (float16)INT_MIN)); + y = select(y, (int16)INT_MAX, convert_int16(x > (float16)INT_MAX)); + return y; +} +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat(float x) +{ + uint y = convert_uint(x); + y = select(y, (uint)0, as_uint(convert_int(x < (float)0))); + y = select(y, (uint)UINT_MAX, as_uint(convert_int(x > (float)UINT_MAX))); + return y; +} +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat(float2 x) +{ + uint2 y = convert_uint2(x); + y = select(y, (uint2)0, as_uint2(convert_int2(x < (float2)0))); + y = select(y, (uint2)UINT_MAX, as_uint2(convert_int2(x > 
(float2)UINT_MAX))); + return y; +} +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat(float3 x) +{ + uint3 y = convert_uint3(x); + y = select(y, (uint3)0, as_uint3(convert_int3(x < (float3)0))); + y = select(y, (uint3)UINT_MAX, as_uint3(convert_int3(x > (float3)UINT_MAX))); + return y; +} +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat(float4 x) +{ + uint4 y = convert_uint4(x); + y = select(y, (uint4)0, as_uint4(convert_int4(x < (float4)0))); + y = select(y, (uint4)UINT_MAX, as_uint4(convert_int4(x > (float4)UINT_MAX))); + return y; +} +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat(float8 x) +{ + uint8 y = convert_uint8(x); + y = select(y, (uint8)0, as_uint8(convert_int8(x < (float8)0))); + y = select(y, (uint8)UINT_MAX, as_uint8(convert_int8(x > (float8)UINT_MAX))); + return y; +} +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat(float16 x) +{ + uint16 y = convert_uint16(x); + y = select(y, (uint16)0, as_uint16(convert_int16(x < (float16)0))); + y = select(y, (uint16)UINT_MAX, as_uint16(convert_int16(x > (float16)UINT_MAX))); + return y; +} +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat(float x) +{ + long y = convert_long(x); + y = select(y, (long)LONG_MIN, convert_long(x < (float)LONG_MIN)); + y = select(y, (long)LONG_MAX, convert_long(x > (float)LONG_MAX)); + return y; +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat(float2 x) +{ + long2 y = convert_long2(x); + y = select(y, (long2)LONG_MIN, convert_long2(x < (float2)LONG_MIN)); + y = select(y, (long2)LONG_MAX, convert_long2(x > (float2)LONG_MAX)); + return y; +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat(float3 x) +{ + long3 y = convert_long3(x); + y = select(y, (long3)LONG_MIN, convert_long3(x < (float3)LONG_MIN)); + y = select(y, (long3)LONG_MAX, convert_long3(x > (float3)LONG_MAX)); + return y; +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat(float4 x) +{ + long4 y = 
convert_long4(x); + y = select(y, (long4)LONG_MIN, convert_long4(x < (float4)LONG_MIN)); + y = select(y, (long4)LONG_MAX, convert_long4(x > (float4)LONG_MAX)); + return y; +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat(float8 x) +{ + long8 y = convert_long8(x); + y = select(y, (long8)LONG_MIN, convert_long8(x < (float8)LONG_MIN)); + y = select(y, (long8)LONG_MAX, convert_long8(x > (float8)LONG_MAX)); + return y; +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat(float16 x) +{ + long16 y = convert_long16(x); + y = select(y, (long16)LONG_MIN, convert_long16(x < (float16)LONG_MIN)); + y = select(y, (long16)LONG_MAX, convert_long16(x > (float16)LONG_MAX)); + return y; +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat(float x) +{ + ulong y = convert_ulong(x); + y = select(y, (ulong)0, as_ulong(convert_long(x < (float)0))); + y = select(y, (ulong)ULONG_MAX, as_ulong(convert_long(x > (float)ULONG_MAX))); + return y; +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat(float2 x) +{ + ulong2 y = convert_ulong2(x); + y = select(y, (ulong2)0, as_ulong2(convert_long2(x < (float2)0))); + y = select(y, (ulong2)ULONG_MAX, as_ulong2(convert_long2(x > (float2)ULONG_MAX))); + return y; +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat(float3 x) +{ + ulong3 y = convert_ulong3(x); + y = select(y, (ulong3)0, as_ulong3(convert_long3(x < (float3)0))); + y = select(y, (ulong3)ULONG_MAX, as_ulong3(convert_long3(x > (float3)ULONG_MAX))); + return y; +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat(float4 x) +{ + ulong4 y = convert_ulong4(x); + y = select(y, (ulong4)0, as_ulong4(convert_long4(x < (float4)0))); + y = select(y, (ulong4)ULONG_MAX, as_ulong4(convert_long4(x > (float4)ULONG_MAX))); + return y; +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 
convert_ulong8_sat(float8 x) +{ + ulong8 y = convert_ulong8(x); + y = select(y, (ulong8)0, as_ulong8(convert_long8(x < (float8)0))); + y = select(y, (ulong8)ULONG_MAX, as_ulong8(convert_long8(x > (float8)ULONG_MAX))); + return y; +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat(float16 x) +{ + ulong16 y = convert_ulong16(x); + y = select(y, (ulong16)0, as_ulong16(convert_long16(x < (float16)0))); + y = select(y, (ulong16)ULONG_MAX, as_ulong16(convert_long16(x > (float16)ULONG_MAX))); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat(double x) +{ + char y = convert_char(x); + y = select(y, (char)CHAR_MIN, convert_char(x < (double)CHAR_MIN)); + y = select(y, (char)CHAR_MAX, convert_char(x > (double)CHAR_MAX)); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat(double2 x) +{ + char2 y = convert_char2(x); + y = select(y, (char2)CHAR_MIN, convert_char2(x < (double2)CHAR_MIN)); + y = select(y, (char2)CHAR_MAX, convert_char2(x > (double2)CHAR_MAX)); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat(double3 x) +{ + char3 y = convert_char3(x); + y = select(y, (char3)CHAR_MIN, convert_char3(x < (double3)CHAR_MIN)); + y = select(y, (char3)CHAR_MAX, convert_char3(x > (double3)CHAR_MAX)); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat(double4 x) +{ + char4 y = convert_char4(x); + y = select(y, (char4)CHAR_MIN, convert_char4(x < (double4)CHAR_MIN)); + y = select(y, (char4)CHAR_MAX, convert_char4(x > (double4)CHAR_MAX)); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat(double8 x) +{ + char8 y = convert_char8(x); + y = select(y, (char8)CHAR_MIN, convert_char8(x < (double8)CHAR_MIN)); + y = select(y, (char8)CHAR_MAX, convert_char8(x > (double8)CHAR_MAX)); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char16 
convert_char16_sat(double16 x) +{ + char16 y = convert_char16(x); + y = select(y, (char16)CHAR_MIN, convert_char16(x < (double16)CHAR_MIN)); + y = select(y, (char16)CHAR_MAX, convert_char16(x > (double16)CHAR_MAX)); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat(double x) +{ + uchar y = convert_uchar(x); + y = select(y, (uchar)0, as_uchar(convert_char(x < (double)0))); + y = select(y, (uchar)UCHAR_MAX, as_uchar(convert_char(x > (double)UCHAR_MAX))); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat(double2 x) +{ + uchar2 y = convert_uchar2(x); + y = select(y, (uchar2)0, as_uchar2(convert_char2(x < (double2)0))); + y = select(y, (uchar2)UCHAR_MAX, as_uchar2(convert_char2(x > (double2)UCHAR_MAX))); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat(double3 x) +{ + uchar3 y = convert_uchar3(x); + y = select(y, (uchar3)0, as_uchar3(convert_char3(x < (double3)0))); + y = select(y, (uchar3)UCHAR_MAX, as_uchar3(convert_char3(x > (double3)UCHAR_MAX))); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat(double4 x) +{ + uchar4 y = convert_uchar4(x); + y = select(y, (uchar4)0, as_uchar4(convert_char4(x < (double4)0))); + y = select(y, (uchar4)UCHAR_MAX, as_uchar4(convert_char4(x > (double4)UCHAR_MAX))); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat(double8 x) +{ + uchar8 y = convert_uchar8(x); + y = select(y, (uchar8)0, as_uchar8(convert_char8(x < (double8)0))); + y = select(y, (uchar8)UCHAR_MAX, as_uchar8(convert_char8(x > (double8)UCHAR_MAX))); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat(double16 x) +{ + uchar16 y = convert_uchar16(x); + y = select(y, (uchar16)0, as_uchar16(convert_char16(x < (double16)0))); + y = select(y, (uchar16)UCHAR_MAX, as_uchar16(convert_char16(x > (double16)UCHAR_MAX))); + 
return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat(double x) +{ + short y = convert_short(x); + y = select(y, (short)SHRT_MIN, convert_short(x < (double)SHRT_MIN)); + y = select(y, (short)SHRT_MAX, convert_short(x > (double)SHRT_MAX)); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat(double2 x) +{ + short2 y = convert_short2(x); + y = select(y, (short2)SHRT_MIN, convert_short2(x < (double2)SHRT_MIN)); + y = select(y, (short2)SHRT_MAX, convert_short2(x > (double2)SHRT_MAX)); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat(double3 x) +{ + short3 y = convert_short3(x); + y = select(y, (short3)SHRT_MIN, convert_short3(x < (double3)SHRT_MIN)); + y = select(y, (short3)SHRT_MAX, convert_short3(x > (double3)SHRT_MAX)); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat(double4 x) +{ + short4 y = convert_short4(x); + y = select(y, (short4)SHRT_MIN, convert_short4(x < (double4)SHRT_MIN)); + y = select(y, (short4)SHRT_MAX, convert_short4(x > (double4)SHRT_MAX)); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat(double8 x) +{ + short8 y = convert_short8(x); + y = select(y, (short8)SHRT_MIN, convert_short8(x < (double8)SHRT_MIN)); + y = select(y, (short8)SHRT_MAX, convert_short8(x > (double8)SHRT_MAX)); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat(double16 x) +{ + short16 y = convert_short16(x); + y = select(y, (short16)SHRT_MIN, convert_short16(x < (double16)SHRT_MIN)); + y = select(y, (short16)SHRT_MAX, convert_short16(x > (double16)SHRT_MAX)); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat(double x) +{ + ushort y = convert_ushort(x); + y = select(y, (ushort)0, as_ushort(convert_short(x < (double)0))); + y = select(y, (ushort)USHRT_MAX, as_ushort(convert_short(x > 
(double)USHRT_MAX))); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat(double2 x) +{ + ushort2 y = convert_ushort2(x); + y = select(y, (ushort2)0, as_ushort2(convert_short2(x < (double2)0))); + y = select(y, (ushort2)USHRT_MAX, as_ushort2(convert_short2(x > (double2)USHRT_MAX))); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_sat(double3 x) +{ + ushort3 y = convert_ushort3(x); + y = select(y, (ushort3)0, as_ushort3(convert_short3(x < (double3)0))); + y = select(y, (ushort3)USHRT_MAX, as_ushort3(convert_short3(x > (double3)USHRT_MAX))); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat(double4 x) +{ + ushort4 y = convert_ushort4(x); + y = select(y, (ushort4)0, as_ushort4(convert_short4(x < (double4)0))); + y = select(y, (ushort4)USHRT_MAX, as_ushort4(convert_short4(x > (double4)USHRT_MAX))); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_sat(double8 x) +{ + ushort8 y = convert_ushort8(x); + y = select(y, (ushort8)0, as_ushort8(convert_short8(x < (double8)0))); + y = select(y, (ushort8)USHRT_MAX, as_ushort8(convert_short8(x > (double8)USHRT_MAX))); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat(double16 x) +{ + ushort16 y = convert_ushort16(x); + y = select(y, (ushort16)0, as_ushort16(convert_short16(x < (double16)0))); + y = select(y, (ushort16)USHRT_MAX, as_ushort16(convert_short16(x > (double16)USHRT_MAX))); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat(double x) +{ + int y = convert_int(x); + y = select(y, (int)INT_MIN, convert_int(x < (double)INT_MIN)); + y = select(y, (int)INT_MAX, convert_int(x > (double)INT_MAX)); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat(double2 x) +{ + int2 y = convert_int2(x); + y = select(y, (int2)INT_MIN, convert_int2(x 
< (double2)INT_MIN)); + y = select(y, (int2)INT_MAX, convert_int2(x > (double2)INT_MAX)); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat(double3 x) +{ + int3 y = convert_int3(x); + y = select(y, (int3)INT_MIN, convert_int3(x < (double3)INT_MIN)); + y = select(y, (int3)INT_MAX, convert_int3(x > (double3)INT_MAX)); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat(double4 x) +{ + int4 y = convert_int4(x); + y = select(y, (int4)INT_MIN, convert_int4(x < (double4)INT_MIN)); + y = select(y, (int4)INT_MAX, convert_int4(x > (double4)INT_MAX)); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat(double8 x) +{ + int8 y = convert_int8(x); + y = select(y, (int8)INT_MIN, convert_int8(x < (double8)INT_MIN)); + y = select(y, (int8)INT_MAX, convert_int8(x > (double8)INT_MAX)); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat(double16 x) +{ + int16 y = convert_int16(x); + y = select(y, (int16)INT_MIN, convert_int16(x < (double16)INT_MIN)); + y = select(y, (int16)INT_MAX, convert_int16(x > (double16)INT_MAX)); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat(double x) +{ + uint y = convert_uint(x); + y = select(y, (uint)0, as_uint(convert_int(x < (double)0))); + y = select(y, (uint)UINT_MAX, as_uint(convert_int(x > (double)UINT_MAX))); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat(double2 x) +{ + uint2 y = convert_uint2(x); + y = select(y, (uint2)0, as_uint2(convert_int2(x < (double2)0))); + y = select(y, (uint2)UINT_MAX, as_uint2(convert_int2(x > (double2)UINT_MAX))); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat(double3 x) +{ + uint3 y = convert_uint3(x); + y = select(y, (uint3)0, as_uint3(convert_int3(x < (double3)0))); + y = select(y, (uint3)UINT_MAX, as_uint3(convert_int3(x > 
(double3)UINT_MAX))); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat(double4 x) +{ + uint4 y = convert_uint4(x); + y = select(y, (uint4)0, as_uint4(convert_int4(x < (double4)0))); + y = select(y, (uint4)UINT_MAX, as_uint4(convert_int4(x > (double4)UINT_MAX))); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat(double8 x) +{ + uint8 y = convert_uint8(x); + y = select(y, (uint8)0, as_uint8(convert_int8(x < (double8)0))); + y = select(y, (uint8)UINT_MAX, as_uint8(convert_int8(x > (double8)UINT_MAX))); + return y; +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat(double16 x) +{ + uint16 y = convert_uint16(x); + y = select(y, (uint16)0, as_uint16(convert_int16(x < (double16)0))); + y = select(y, (uint16)UINT_MAX, as_uint16(convert_int16(x > (double16)UINT_MAX))); + return y; +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat(double x) +{ + long y = convert_long(x); + y = select(y, (long)LONG_MIN, convert_long(x < (double)LONG_MIN)); + y = select(y, (long)LONG_MAX, convert_long(x > (double)LONG_MAX)); + return y; +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat(double2 x) +{ + long2 y = convert_long2(x); + y = select(y, (long2)LONG_MIN, convert_long2(x < (double2)LONG_MIN)); + y = select(y, (long2)LONG_MAX, convert_long2(x > (double2)LONG_MAX)); + return y; +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat(double3 x) +{ + long3 y = convert_long3(x); + y = select(y, (long3)LONG_MIN, convert_long3(x < (double3)LONG_MIN)); + y = select(y, (long3)LONG_MAX, convert_long3(x > (double3)LONG_MAX)); + return y; +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat(double4 x) +{ + long4 y = convert_long4(x); + y = select(y, 
(long4)LONG_MIN, convert_long4(x < (double4)LONG_MIN)); + y = select(y, (long4)LONG_MAX, convert_long4(x > (double4)LONG_MAX)); + return y; +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat(double8 x) +{ + long8 y = convert_long8(x); + y = select(y, (long8)LONG_MIN, convert_long8(x < (double8)LONG_MIN)); + y = select(y, (long8)LONG_MAX, convert_long8(x > (double8)LONG_MAX)); + return y; +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat(double16 x) +{ + long16 y = convert_long16(x); + y = select(y, (long16)LONG_MIN, convert_long16(x < (double16)LONG_MIN)); + y = select(y, (long16)LONG_MAX, convert_long16(x > (double16)LONG_MAX)); + return y; +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat(double x) +{ + ulong y = convert_ulong(x); + y = select(y, (ulong)0, as_ulong(convert_long(x < (double)0))); + y = select(y, (ulong)ULONG_MAX, as_ulong(convert_long(x > (double)ULONG_MAX))); + return y; +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat(double2 x) +{ + ulong2 y = convert_ulong2(x); + y = select(y, (ulong2)0, as_ulong2(convert_long2(x < (double2)0))); + y = select(y, (ulong2)ULONG_MAX, as_ulong2(convert_long2(x > (double2)ULONG_MAX))); + return y; +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat(double3 x) +{ + ulong3 y = convert_ulong3(x); + y = select(y, (ulong3)0, as_ulong3(convert_long3(x < (double3)0))); + y = select(y, (ulong3)ULONG_MAX, as_ulong3(convert_long3(x > (double3)ULONG_MAX))); + return y; +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat(double4 x) +{ + ulong4 y = convert_ulong4(x); + y = select(y, (ulong4)0, as_ulong4(convert_long4(x < (double4)0))); + y = select(y, 
(ulong4)ULONG_MAX, as_ulong4(convert_long4(x > (double4)ULONG_MAX))); + return y; +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat(double8 x) +{ + ulong8 y = convert_ulong8(x); + y = select(y, (ulong8)0, as_ulong8(convert_long8(x < (double8)0))); + y = select(y, (ulong8)ULONG_MAX, as_ulong8(convert_long8(x > (double8)ULONG_MAX))); + return y; +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat(double16 x) +{ + ulong16 y = convert_ulong16(x); + y = select(y, (ulong16)0, as_ulong16(convert_long16(x < (double16)0))); + y = select(y, (ulong16)ULONG_MAX, as_ulong16(convert_long16(x > (double16)ULONG_MAX))); + return y; +} +#endif +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rtz(char x) +{ + return convert_char_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rte(char x) +{ + return convert_char_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rtp(char x) +{ + return convert_char_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rtn(char x) +{ + return convert_char_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rtz(char2 x) +{ + return convert_char2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rte(char2 x) +{ + return convert_char2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rtp(char2 x) +{ + return convert_char2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rtn(char2 x) +{ + return convert_char2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rtz(char3 x) +{ + return convert_char3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rte(char3 x) +{ + return convert_char3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rtp(char3 x) +{ + return convert_char3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rtn(char3 x) +{ + return convert_char3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char4 
convert_char4_sat_rtz(char4 x) +{ + return convert_char4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rte(char4 x) +{ + return convert_char4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rtp(char4 x) +{ + return convert_char4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rtn(char4 x) +{ + return convert_char4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rtz(char8 x) +{ + return convert_char8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rte(char8 x) +{ + return convert_char8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rtp(char8 x) +{ + return convert_char8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rtn(char8 x) +{ + return convert_char8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rtz(char16 x) +{ + return convert_char16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rte(char16 x) +{ + return convert_char16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rtp(char16 x) +{ + return convert_char16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rtn(char16 x) +{ + return convert_char16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rtz(char x) +{ + return convert_uchar_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rte(char x) +{ + return convert_uchar_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rtp(char x) +{ + return convert_uchar_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rtn(char x) +{ + return convert_uchar_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rtz(char2 x) +{ + return convert_uchar2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rte(char2 x) +{ + return convert_uchar2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rtp(char2 x) +{ + return convert_uchar2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rtn(char2 x) +{ + return 
convert_uchar2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rtz(char3 x) +{ + return convert_uchar3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rte(char3 x) +{ + return convert_uchar3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rtp(char3 x) +{ + return convert_uchar3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rtn(char3 x) +{ + return convert_uchar3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rtz(char4 x) +{ + return convert_uchar4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rte(char4 x) +{ + return convert_uchar4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rtp(char4 x) +{ + return convert_uchar4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rtn(char4 x) +{ + return convert_uchar4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rtz(char8 x) +{ + return convert_uchar8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rte(char8 x) +{ + return convert_uchar8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rtp(char8 x) +{ + return convert_uchar8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rtn(char8 x) +{ + return convert_uchar8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rtz(char16 x) +{ + return convert_uchar16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rte(char16 x) +{ + return convert_uchar16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rtp(char16 x) +{ + return convert_uchar16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rtn(char16 x) +{ + return convert_uchar16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rtz(char x) +{ + return convert_short_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rte(char x) +{ + return convert_short_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rtp(char x) +{ + return 
convert_short_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rtn(char x) +{ + return convert_short_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rtz(char2 x) +{ + return convert_short2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rte(char2 x) +{ + return convert_short2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rtp(char2 x) +{ + return convert_short2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rtn(char2 x) +{ + return convert_short2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rtz(char3 x) +{ + return convert_short3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rte(char3 x) +{ + return convert_short3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rtp(char3 x) +{ + return convert_short3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rtn(char3 x) +{ + return convert_short3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rtz(char4 x) +{ + return convert_short4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rte(char4 x) +{ + return convert_short4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rtp(char4 x) +{ + return convert_short4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rtn(char4 x) +{ + return convert_short4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rtz(char8 x) +{ + return convert_short8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rte(char8 x) +{ + return convert_short8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rtp(char8 x) +{ + return convert_short8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rtn(char8 x) +{ + return convert_short8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat_rtz(char16 x) +{ + return convert_short16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat_rte(char16 x) +{ + return 
convert_short16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat_rtp(char16 x) +{ + return convert_short16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat_rtn(char16 x) +{ + return convert_short16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat_rtz(char x) +{ + return convert_ushort_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat_rte(char x) +{ + return convert_ushort_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat_rtp(char x) +{ + return convert_ushort_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat_rtn(char x) +{ + return convert_ushort_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat_rtz(char2 x) +{ + return convert_ushort2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat_rte(char2 x) +{ + return convert_ushort2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat_rtp(char2 x) +{ + return convert_ushort2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat_rtn(char2 x) +{ + return convert_ushort2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_sat_rtz(char3 x) +{ + return convert_ushort3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_sat_rte(char3 x) +{ + return convert_ushort3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_sat_rtp(char3 x) +{ + return convert_ushort3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_sat_rtn(char3 x) +{ + return convert_ushort3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat_rtz(char4 x) +{ + return convert_ushort4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat_rte(char4 x) +{ + return convert_ushort4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat_rtp(char4 x) +{ + return convert_ushort4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat_rtn(char4 x) +{ + return convert_ushort4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 
convert_ushort8_sat_rtz(char8 x) +{ + return convert_ushort8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_sat_rte(char8 x) +{ + return convert_ushort8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_sat_rtp(char8 x) +{ + return convert_ushort8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_sat_rtn(char8 x) +{ + return convert_ushort8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat_rtz(char16 x) +{ + return convert_ushort16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat_rte(char16 x) +{ + return convert_ushort16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat_rtp(char16 x) +{ + return convert_ushort16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat_rtn(char16 x) +{ + return convert_ushort16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat_rtz(char x) +{ + return convert_int_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat_rte(char x) +{ + return convert_int_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat_rtp(char x) +{ + return convert_int_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat_rtn(char x) +{ + return convert_int_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat_rtz(char2 x) +{ + return convert_int2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat_rte(char2 x) +{ + return convert_int2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat_rtp(char2 x) +{ + return convert_int2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat_rtn(char2 x) +{ + return convert_int2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat_rtz(char3 x) +{ + return convert_int3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat_rte(char3 x) +{ + return convert_int3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat_rtp(char3 x) +{ + return convert_int3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat_rtn(char3 x) +{ + return convert_int3_sat(x); +} + 
+_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat_rtz(char4 x) +{ + return convert_int4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat_rte(char4 x) +{ + return convert_int4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat_rtp(char4 x) +{ + return convert_int4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat_rtn(char4 x) +{ + return convert_int4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rtz(char8 x) +{ + return convert_int8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rte(char8 x) +{ + return convert_int8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rtp(char8 x) +{ + return convert_int8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rtn(char8 x) +{ + return convert_int8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rtz(char16 x) +{ + return convert_int16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rte(char16 x) +{ + return convert_int16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rtp(char16 x) +{ + return convert_int16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rtn(char16 x) +{ + return convert_int16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rtz(char x) +{ + return convert_uint_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rte(char x) +{ + return convert_uint_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rtp(char x) +{ + return convert_uint_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rtn(char x) +{ + return convert_uint_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rtz(char2 x) +{ + return convert_uint2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rte(char2 x) +{ + return convert_uint2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rtp(char2 x) +{ + return convert_uint2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rtn(char2 x) +{ + return convert_uint2_sat(x); +} + +_CLC_DEF 
_CLC_OVERLOAD +uint3 convert_uint3_sat_rtz(char3 x) +{ + return convert_uint3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat_rte(char3 x) +{ + return convert_uint3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat_rtp(char3 x) +{ + return convert_uint3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat_rtn(char3 x) +{ + return convert_uint3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rtz(char4 x) +{ + return convert_uint4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rte(char4 x) +{ + return convert_uint4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rtp(char4 x) +{ + return convert_uint4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rtn(char4 x) +{ + return convert_uint4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rtz(char8 x) +{ + return convert_uint8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rte(char8 x) +{ + return convert_uint8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rtp(char8 x) +{ + return convert_uint8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rtn(char8 x) +{ + return convert_uint8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat_rtz(char16 x) +{ + return convert_uint16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat_rte(char16 x) +{ + return convert_uint16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat_rtp(char16 x) +{ + return convert_uint16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat_rtn(char16 x) +{ + return convert_uint16_sat(x); +} + +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rtz(char x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rte(char x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rtp(char x) +{ + return convert_long_sat(x); +} + 
+#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rtn(char x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rtz(char2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rte(char2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rtp(char2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rtn(char2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rtz(char3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rte(char3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rtp(char3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rtn(char3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rtz(char4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rte(char4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rtp(char4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rtn(char4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rtz(char8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rte(char8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef 
cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rtp(char8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rtn(char8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rtz(char16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rte(char16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rtp(char16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rtn(char16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rtz(char x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rte(char x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rtp(char x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rtn(char x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rtz(char2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rte(char2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rtp(char2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rtn(char2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rtz(char3 x) +{ + return convert_ulong3_sat(x); +} + +#endif 
+#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rte(char3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rtp(char3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rtn(char3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rtz(char4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rte(char4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rtp(char4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rtn(char4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rtz(char8 x) +{ + return convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rte(char8 x) +{ + return convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rtp(char8 x) +{ + return convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rtn(char8 x) +{ + return convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rtz(char16 x) +{ + return convert_ulong16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rte(char16 x) +{ + return convert_ulong16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rtp(char16 x) +{ + return convert_ulong16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rtn(char16 x) +{ + 
return convert_ulong16_sat(x); +} + +#endif +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rtz(uchar x) +{ + return convert_char_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rte(uchar x) +{ + return convert_char_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rtp(uchar x) +{ + return convert_char_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rtn(uchar x) +{ + return convert_char_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rtz(uchar2 x) +{ + return convert_char2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rte(uchar2 x) +{ + return convert_char2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rtp(uchar2 x) +{ + return convert_char2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rtn(uchar2 x) +{ + return convert_char2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rtz(uchar3 x) +{ + return convert_char3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rte(uchar3 x) +{ + return convert_char3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rtp(uchar3 x) +{ + return convert_char3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rtn(uchar3 x) +{ + return convert_char3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rtz(uchar4 x) +{ + return convert_char4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rte(uchar4 x) +{ + return convert_char4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rtp(uchar4 x) +{ + return convert_char4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rtn(uchar4 x) +{ + return convert_char4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rtz(uchar8 x) +{ + return convert_char8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rte(uchar8 x) +{ + return convert_char8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rtp(uchar8 x) +{ + return convert_char8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char8 
convert_char8_sat_rtn(uchar8 x) +{ + return convert_char8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rtz(uchar16 x) +{ + return convert_char16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rte(uchar16 x) +{ + return convert_char16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rtp(uchar16 x) +{ + return convert_char16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rtn(uchar16 x) +{ + return convert_char16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rtz(uchar x) +{ + return convert_uchar_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rte(uchar x) +{ + return convert_uchar_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rtp(uchar x) +{ + return convert_uchar_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rtn(uchar x) +{ + return convert_uchar_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rtz(uchar2 x) +{ + return convert_uchar2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rte(uchar2 x) +{ + return convert_uchar2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rtp(uchar2 x) +{ + return convert_uchar2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rtn(uchar2 x) +{ + return convert_uchar2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rtz(uchar3 x) +{ + return convert_uchar3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rte(uchar3 x) +{ + return convert_uchar3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rtp(uchar3 x) +{ + return convert_uchar3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rtn(uchar3 x) +{ + return convert_uchar3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rtz(uchar4 x) +{ + return convert_uchar4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rte(uchar4 x) +{ + return convert_uchar4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 
convert_uchar4_sat_rtp(uchar4 x) +{ + return convert_uchar4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rtn(uchar4 x) +{ + return convert_uchar4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rtz(uchar8 x) +{ + return convert_uchar8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rte(uchar8 x) +{ + return convert_uchar8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rtp(uchar8 x) +{ + return convert_uchar8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rtn(uchar8 x) +{ + return convert_uchar8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rtz(uchar16 x) +{ + return convert_uchar16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rte(uchar16 x) +{ + return convert_uchar16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rtp(uchar16 x) +{ + return convert_uchar16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rtn(uchar16 x) +{ + return convert_uchar16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rtz(uchar x) +{ + return convert_short_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rte(uchar x) +{ + return convert_short_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rtp(uchar x) +{ + return convert_short_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rtn(uchar x) +{ + return convert_short_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rtz(uchar2 x) +{ + return convert_short2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rte(uchar2 x) +{ + return convert_short2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rtp(uchar2 x) +{ + return convert_short2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rtn(uchar2 x) +{ + return convert_short2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rtz(uchar3 x) +{ + return convert_short3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short3 
convert_short3_sat_rte(uchar3 x) +{ + return convert_short3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rtp(uchar3 x) +{ + return convert_short3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rtn(uchar3 x) +{ + return convert_short3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rtz(uchar4 x) +{ + return convert_short4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rte(uchar4 x) +{ + return convert_short4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rtp(uchar4 x) +{ + return convert_short4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rtn(uchar4 x) +{ + return convert_short4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rtz(uchar8 x) +{ + return convert_short8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rte(uchar8 x) +{ + return convert_short8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rtp(uchar8 x) +{ + return convert_short8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rtn(uchar8 x) +{ + return convert_short8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat_rtz(uchar16 x) +{ + return convert_short16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat_rte(uchar16 x) +{ + return convert_short16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat_rtp(uchar16 x) +{ + return convert_short16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat_rtn(uchar16 x) +{ + return convert_short16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat_rtz(uchar x) +{ + return convert_ushort_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat_rte(uchar x) +{ + return convert_ushort_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat_rtp(uchar x) +{ + return convert_ushort_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat_rtn(uchar x) +{ + return convert_ushort_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD 
+ushort2 convert_ushort2_sat_rtz(uchar2 x) +{ + return convert_ushort2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat_rte(uchar2 x) +{ + return convert_ushort2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat_rtp(uchar2 x) +{ + return convert_ushort2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat_rtn(uchar2 x) +{ + return convert_ushort2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_sat_rtz(uchar3 x) +{ + return convert_ushort3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_sat_rte(uchar3 x) +{ + return convert_ushort3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_sat_rtp(uchar3 x) +{ + return convert_ushort3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_sat_rtn(uchar3 x) +{ + return convert_ushort3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat_rtz(uchar4 x) +{ + return convert_ushort4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat_rte(uchar4 x) +{ + return convert_ushort4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat_rtp(uchar4 x) +{ + return convert_ushort4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat_rtn(uchar4 x) +{ + return convert_ushort4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_sat_rtz(uchar8 x) +{ + return convert_ushort8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_sat_rte(uchar8 x) +{ + return convert_ushort8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_sat_rtp(uchar8 x) +{ + return convert_ushort8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_sat_rtn(uchar8 x) +{ + return convert_ushort8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat_rtz(uchar16 x) +{ + return convert_ushort16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat_rte(uchar16 x) +{ + return convert_ushort16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat_rtp(uchar16 x) 
+{ + return convert_ushort16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat_rtn(uchar16 x) +{ + return convert_ushort16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat_rtz(uchar x) +{ + return convert_int_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat_rte(uchar x) +{ + return convert_int_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat_rtp(uchar x) +{ + return convert_int_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat_rtn(uchar x) +{ + return convert_int_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat_rtz(uchar2 x) +{ + return convert_int2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat_rte(uchar2 x) +{ + return convert_int2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat_rtp(uchar2 x) +{ + return convert_int2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat_rtn(uchar2 x) +{ + return convert_int2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat_rtz(uchar3 x) +{ + return convert_int3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat_rte(uchar3 x) +{ + return convert_int3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat_rtp(uchar3 x) +{ + return convert_int3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat_rtn(uchar3 x) +{ + return convert_int3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat_rtz(uchar4 x) +{ + return convert_int4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat_rte(uchar4 x) +{ + return convert_int4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat_rtp(uchar4 x) +{ + return convert_int4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat_rtn(uchar4 x) +{ + return convert_int4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rtz(uchar8 x) +{ + return convert_int8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rte(uchar8 x) +{ + return convert_int8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rtp(uchar8 x) +{ + return 
convert_int8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rtn(uchar8 x) +{ + return convert_int8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rtz(uchar16 x) +{ + return convert_int16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rte(uchar16 x) +{ + return convert_int16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rtp(uchar16 x) +{ + return convert_int16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rtn(uchar16 x) +{ + return convert_int16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rtz(uchar x) +{ + return convert_uint_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rte(uchar x) +{ + return convert_uint_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rtp(uchar x) +{ + return convert_uint_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rtn(uchar x) +{ + return convert_uint_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rtz(uchar2 x) +{ + return convert_uint2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rte(uchar2 x) +{ + return convert_uint2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rtp(uchar2 x) +{ + return convert_uint2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rtn(uchar2 x) +{ + return convert_uint2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat_rtz(uchar3 x) +{ + return convert_uint3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat_rte(uchar3 x) +{ + return convert_uint3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat_rtp(uchar3 x) +{ + return convert_uint3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat_rtn(uchar3 x) +{ + return convert_uint3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rtz(uchar4 x) +{ + return convert_uint4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rte(uchar4 x) +{ + return convert_uint4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 
convert_uint4_sat_rtp(uchar4 x) +{ + return convert_uint4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rtn(uchar4 x) +{ + return convert_uint4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rtz(uchar8 x) +{ + return convert_uint8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rte(uchar8 x) +{ + return convert_uint8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rtp(uchar8 x) +{ + return convert_uint8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rtn(uchar8 x) +{ + return convert_uint8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat_rtz(uchar16 x) +{ + return convert_uint16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat_rte(uchar16 x) +{ + return convert_uint16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat_rtp(uchar16 x) +{ + return convert_uint16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat_rtn(uchar16 x) +{ + return convert_uint16_sat(x); +} + +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rtz(uchar x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rte(uchar x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rtp(uchar x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rtn(uchar x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rtz(uchar2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rte(uchar2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rtp(uchar2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 
convert_long2_sat_rtn(uchar2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rtz(uchar3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rte(uchar3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rtp(uchar3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rtn(uchar3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rtz(uchar4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rte(uchar4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rtp(uchar4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rtn(uchar4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rtz(uchar8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rte(uchar8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rtp(uchar8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rtn(uchar8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rtz(uchar16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rte(uchar16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD 
+long16 convert_long16_sat_rtp(uchar16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rtn(uchar16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rtz(uchar x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rte(uchar x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rtp(uchar x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rtn(uchar x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rtz(uchar2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rte(uchar2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rtp(uchar2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rtn(uchar2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rtz(uchar3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rte(uchar3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rtp(uchar3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rtn(uchar3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rtz(uchar4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef 
cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rte(uchar4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rtp(uchar4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rtn(uchar4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rtz(uchar8 x) +{ + return convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rte(uchar8 x) +{ + return convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rtp(uchar8 x) +{ + return convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rtn(uchar8 x) +{ + return convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rtz(uchar16 x) +{ + return convert_ulong16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rte(uchar16 x) +{ + return convert_ulong16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rtp(uchar16 x) +{ + return convert_ulong16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rtn(uchar16 x) +{ + return convert_ulong16_sat(x); +} + +#endif +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rtz(short x) +{ + return convert_char_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rte(short x) +{ + return convert_char_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rtp(short x) +{ + return convert_char_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rtn(short x) +{ + return convert_char_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rtz(short2 x) +{ + return convert_char2_sat(x); +} + 
+_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rte(short2 x) +{ + return convert_char2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rtp(short2 x) +{ + return convert_char2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rtn(short2 x) +{ + return convert_char2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rtz(short3 x) +{ + return convert_char3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rte(short3 x) +{ + return convert_char3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rtp(short3 x) +{ + return convert_char3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rtn(short3 x) +{ + return convert_char3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rtz(short4 x) +{ + return convert_char4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rte(short4 x) +{ + return convert_char4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rtp(short4 x) +{ + return convert_char4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rtn(short4 x) +{ + return convert_char4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rtz(short8 x) +{ + return convert_char8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rte(short8 x) +{ + return convert_char8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rtp(short8 x) +{ + return convert_char8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rtn(short8 x) +{ + return convert_char8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rtz(short16 x) +{ + return convert_char16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rte(short16 x) +{ + return convert_char16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rtp(short16 x) +{ + return convert_char16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rtn(short16 x) +{ + return convert_char16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar 
convert_uchar_sat_rtz(short x) +{ + return convert_uchar_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rte(short x) +{ + return convert_uchar_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rtp(short x) +{ + return convert_uchar_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rtn(short x) +{ + return convert_uchar_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rtz(short2 x) +{ + return convert_uchar2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rte(short2 x) +{ + return convert_uchar2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rtp(short2 x) +{ + return convert_uchar2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rtn(short2 x) +{ + return convert_uchar2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rtz(short3 x) +{ + return convert_uchar3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rte(short3 x) +{ + return convert_uchar3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rtp(short3 x) +{ + return convert_uchar3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rtn(short3 x) +{ + return convert_uchar3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rtz(short4 x) +{ + return convert_uchar4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rte(short4 x) +{ + return convert_uchar4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rtp(short4 x) +{ + return convert_uchar4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rtn(short4 x) +{ + return convert_uchar4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rtz(short8 x) +{ + return convert_uchar8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rte(short8 x) +{ + return convert_uchar8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rtp(short8 x) +{ + return convert_uchar8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 
convert_uchar8_sat_rtn(short8 x) +{ + return convert_uchar8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rtz(short16 x) +{ + return convert_uchar16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rte(short16 x) +{ + return convert_uchar16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rtp(short16 x) +{ + return convert_uchar16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rtn(short16 x) +{ + return convert_uchar16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rtz(short x) +{ + return convert_short_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rte(short x) +{ + return convert_short_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rtp(short x) +{ + return convert_short_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rtn(short x) +{ + return convert_short_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rtz(short2 x) +{ + return convert_short2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rte(short2 x) +{ + return convert_short2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rtp(short2 x) +{ + return convert_short2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rtn(short2 x) +{ + return convert_short2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rtz(short3 x) +{ + return convert_short3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rte(short3 x) +{ + return convert_short3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rtp(short3 x) +{ + return convert_short3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rtn(short3 x) +{ + return convert_short3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rtz(short4 x) +{ + return convert_short4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rte(short4 x) +{ + return convert_short4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short4 
convert_short4_sat_rtp(short4 x) +{ + return convert_short4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rtn(short4 x) +{ + return convert_short4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rtz(short8 x) +{ + return convert_short8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rte(short8 x) +{ + return convert_short8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rtp(short8 x) +{ + return convert_short8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rtn(short8 x) +{ + return convert_short8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat_rtz(short16 x) +{ + return convert_short16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat_rte(short16 x) +{ + return convert_short16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat_rtp(short16 x) +{ + return convert_short16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat_rtn(short16 x) +{ + return convert_short16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat_rtz(short x) +{ + return convert_ushort_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat_rte(short x) +{ + return convert_ushort_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat_rtp(short x) +{ + return convert_ushort_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat_rtn(short x) +{ + return convert_ushort_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat_rtz(short2 x) +{ + return convert_ushort2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat_rte(short2 x) +{ + return convert_ushort2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat_rtp(short2 x) +{ + return convert_ushort2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat_rtn(short2 x) +{ + return convert_ushort2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_sat_rtz(short3 x) +{ + return convert_ushort3_sat(x); +} + +_CLC_DEF 
_CLC_OVERLOAD +ushort3 convert_ushort3_sat_rte(short3 x) +{ + return convert_ushort3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_sat_rtp(short3 x) +{ + return convert_ushort3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_sat_rtn(short3 x) +{ + return convert_ushort3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat_rtz(short4 x) +{ + return convert_ushort4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat_rte(short4 x) +{ + return convert_ushort4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat_rtp(short4 x) +{ + return convert_ushort4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat_rtn(short4 x) +{ + return convert_ushort4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_sat_rtz(short8 x) +{ + return convert_ushort8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_sat_rte(short8 x) +{ + return convert_ushort8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_sat_rtp(short8 x) +{ + return convert_ushort8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_sat_rtn(short8 x) +{ + return convert_ushort8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat_rtz(short16 x) +{ + return convert_ushort16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat_rte(short16 x) +{ + return convert_ushort16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat_rtp(short16 x) +{ + return convert_ushort16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat_rtn(short16 x) +{ + return convert_ushort16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat_rtz(short x) +{ + return convert_int_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat_rte(short x) +{ + return convert_int_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat_rtp(short x) +{ + return convert_int_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat_rtn(short x) +{ + return 
convert_int_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat_rtz(short2 x) +{ + return convert_int2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat_rte(short2 x) +{ + return convert_int2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat_rtp(short2 x) +{ + return convert_int2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat_rtn(short2 x) +{ + return convert_int2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat_rtz(short3 x) +{ + return convert_int3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat_rte(short3 x) +{ + return convert_int3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat_rtp(short3 x) +{ + return convert_int3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat_rtn(short3 x) +{ + return convert_int3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat_rtz(short4 x) +{ + return convert_int4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat_rte(short4 x) +{ + return convert_int4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat_rtp(short4 x) +{ + return convert_int4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat_rtn(short4 x) +{ + return convert_int4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rtz(short8 x) +{ + return convert_int8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rte(short8 x) +{ + return convert_int8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rtp(short8 x) +{ + return convert_int8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rtn(short8 x) +{ + return convert_int8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rtz(short16 x) +{ + return convert_int16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rte(short16 x) +{ + return convert_int16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rtp(short16 x) +{ + return convert_int16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rtn(short16 x) +{ + return 
convert_int16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rtz(short x) +{ + return convert_uint_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rte(short x) +{ + return convert_uint_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rtp(short x) +{ + return convert_uint_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rtn(short x) +{ + return convert_uint_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rtz(short2 x) +{ + return convert_uint2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rte(short2 x) +{ + return convert_uint2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rtp(short2 x) +{ + return convert_uint2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rtn(short2 x) +{ + return convert_uint2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat_rtz(short3 x) +{ + return convert_uint3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat_rte(short3 x) +{ + return convert_uint3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat_rtp(short3 x) +{ + return convert_uint3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat_rtn(short3 x) +{ + return convert_uint3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rtz(short4 x) +{ + return convert_uint4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rte(short4 x) +{ + return convert_uint4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rtp(short4 x) +{ + return convert_uint4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rtn(short4 x) +{ + return convert_uint4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rtz(short8 x) +{ + return convert_uint8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rte(short8 x) +{ + return convert_uint8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rtp(short8 x) +{ + return convert_uint8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 
convert_uint8_sat_rtn(short8 x) +{ + return convert_uint8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat_rtz(short16 x) +{ + return convert_uint16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat_rte(short16 x) +{ + return convert_uint16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat_rtp(short16 x) +{ + return convert_uint16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat_rtn(short16 x) +{ + return convert_uint16_sat(x); +} + +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rtz(short x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rte(short x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rtp(short x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rtn(short x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rtz(short2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rte(short2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rtp(short2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rtn(short2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rtz(short3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rte(short3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rtp(short3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 
convert_long3_sat_rtn(short3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rtz(short4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rte(short4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rtp(short4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rtn(short4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rtz(short8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rte(short8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rtp(short8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rtn(short8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rtz(short16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rte(short16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rtp(short16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rtn(short16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rtz(short x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rte(short x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF 
_CLC_OVERLOAD +ulong convert_ulong_sat_rtp(short x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rtn(short x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rtz(short2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rte(short2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rtp(short2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rtn(short2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rtz(short3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rte(short3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rtp(short3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rtn(short3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rtz(short4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rte(short4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rtp(short4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rtn(short4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rtz(short8 x) +{ + return convert_ulong8_sat(x); +} + +#endif 
+#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rte(short8 x) +{ + return convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rtp(short8 x) +{ + return convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rtn(short8 x) +{ + return convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rtz(short16 x) +{ + return convert_ulong16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rte(short16 x) +{ + return convert_ulong16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rtp(short16 x) +{ + return convert_ulong16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rtn(short16 x) +{ + return convert_ulong16_sat(x); +} + +#endif +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rtz(ushort x) +{ + return convert_char_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rte(ushort x) +{ + return convert_char_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rtp(ushort x) +{ + return convert_char_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rtn(ushort x) +{ + return convert_char_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rtz(ushort2 x) +{ + return convert_char2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rte(ushort2 x) +{ + return convert_char2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rtp(ushort2 x) +{ + return convert_char2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rtn(ushort2 x) +{ + return convert_char2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rtz(ushort3 x) +{ + return convert_char3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rte(ushort3 x) +{ + return convert_char3_sat(x); +} + +_CLC_DEF 
_CLC_OVERLOAD +char3 convert_char3_sat_rtp(ushort3 x) +{ + return convert_char3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rtn(ushort3 x) +{ + return convert_char3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rtz(ushort4 x) +{ + return convert_char4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rte(ushort4 x) +{ + return convert_char4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rtp(ushort4 x) +{ + return convert_char4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rtn(ushort4 x) +{ + return convert_char4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rtz(ushort8 x) +{ + return convert_char8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rte(ushort8 x) +{ + return convert_char8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rtp(ushort8 x) +{ + return convert_char8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rtn(ushort8 x) +{ + return convert_char8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rtz(ushort16 x) +{ + return convert_char16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rte(ushort16 x) +{ + return convert_char16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rtp(ushort16 x) +{ + return convert_char16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rtn(ushort16 x) +{ + return convert_char16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rtz(ushort x) +{ + return convert_uchar_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rte(ushort x) +{ + return convert_uchar_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rtp(ushort x) +{ + return convert_uchar_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rtn(ushort x) +{ + return convert_uchar_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rtz(ushort2 x) +{ + return convert_uchar2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 
convert_uchar2_sat_rte(ushort2 x) +{ + return convert_uchar2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rtp(ushort2 x) +{ + return convert_uchar2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rtn(ushort2 x) +{ + return convert_uchar2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rtz(ushort3 x) +{ + return convert_uchar3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rte(ushort3 x) +{ + return convert_uchar3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rtp(ushort3 x) +{ + return convert_uchar3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rtn(ushort3 x) +{ + return convert_uchar3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rtz(ushort4 x) +{ + return convert_uchar4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rte(ushort4 x) +{ + return convert_uchar4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rtp(ushort4 x) +{ + return convert_uchar4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rtn(ushort4 x) +{ + return convert_uchar4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rtz(ushort8 x) +{ + return convert_uchar8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rte(ushort8 x) +{ + return convert_uchar8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rtp(ushort8 x) +{ + return convert_uchar8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rtn(ushort8 x) +{ + return convert_uchar8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rtz(ushort16 x) +{ + return convert_uchar16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rte(ushort16 x) +{ + return convert_uchar16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rtp(ushort16 x) +{ + return convert_uchar16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rtn(ushort16 x) +{ + return convert_uchar16_sat(x); +} + 
+_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rtz(ushort x) +{ + return convert_short_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rte(ushort x) +{ + return convert_short_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rtp(ushort x) +{ + return convert_short_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rtn(ushort x) +{ + return convert_short_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rtz(ushort2 x) +{ + return convert_short2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rte(ushort2 x) +{ + return convert_short2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rtp(ushort2 x) +{ + return convert_short2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rtn(ushort2 x) +{ + return convert_short2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rtz(ushort3 x) +{ + return convert_short3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rte(ushort3 x) +{ + return convert_short3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rtp(ushort3 x) +{ + return convert_short3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rtn(ushort3 x) +{ + return convert_short3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rtz(ushort4 x) +{ + return convert_short4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rte(ushort4 x) +{ + return convert_short4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rtp(ushort4 x) +{ + return convert_short4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rtn(ushort4 x) +{ + return convert_short4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rtz(ushort8 x) +{ + return convert_short8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rte(ushort8 x) +{ + return convert_short8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rtp(ushort8 x) +{ + return convert_short8_sat(x); +} + 
+_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rtn(ushort8 x) +{ + return convert_short8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat_rtz(ushort16 x) +{ + return convert_short16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat_rte(ushort16 x) +{ + return convert_short16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat_rtp(ushort16 x) +{ + return convert_short16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat_rtn(ushort16 x) +{ + return convert_short16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat_rtz(ushort x) +{ + return convert_ushort_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat_rte(ushort x) +{ + return convert_ushort_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat_rtp(ushort x) +{ + return convert_ushort_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat_rtn(ushort x) +{ + return convert_ushort_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat_rtz(ushort2 x) +{ + return convert_ushort2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat_rte(ushort2 x) +{ + return convert_ushort2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat_rtp(ushort2 x) +{ + return convert_ushort2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat_rtn(ushort2 x) +{ + return convert_ushort2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_sat_rtz(ushort3 x) +{ + return convert_ushort3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_sat_rte(ushort3 x) +{ + return convert_ushort3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_sat_rtp(ushort3 x) +{ + return convert_ushort3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_sat_rtn(ushort3 x) +{ + return convert_ushort3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat_rtz(ushort4 x) +{ + return convert_ushort4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 
convert_ushort4_sat_rte(ushort4 x) +{ + return convert_ushort4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat_rtp(ushort4 x) +{ + return convert_ushort4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat_rtn(ushort4 x) +{ + return convert_ushort4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_sat_rtz(ushort8 x) +{ + return convert_ushort8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_sat_rte(ushort8 x) +{ + return convert_ushort8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_sat_rtp(ushort8 x) +{ + return convert_ushort8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_sat_rtn(ushort8 x) +{ + return convert_ushort8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat_rtz(ushort16 x) +{ + return convert_ushort16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat_rte(ushort16 x) +{ + return convert_ushort16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat_rtp(ushort16 x) +{ + return convert_ushort16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat_rtn(ushort16 x) +{ + return convert_ushort16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat_rtz(ushort x) +{ + return convert_int_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat_rte(ushort x) +{ + return convert_int_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat_rtp(ushort x) +{ + return convert_int_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat_rtn(ushort x) +{ + return convert_int_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat_rtz(ushort2 x) +{ + return convert_int2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat_rte(ushort2 x) +{ + return convert_int2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat_rtp(ushort2 x) +{ + return convert_int2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat_rtn(ushort2 x) +{ + return convert_int2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int3 
convert_int3_sat_rtz(ushort3 x) +{ + return convert_int3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat_rte(ushort3 x) +{ + return convert_int3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat_rtp(ushort3 x) +{ + return convert_int3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat_rtn(ushort3 x) +{ + return convert_int3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat_rtz(ushort4 x) +{ + return convert_int4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat_rte(ushort4 x) +{ + return convert_int4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat_rtp(ushort4 x) +{ + return convert_int4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat_rtn(ushort4 x) +{ + return convert_int4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rtz(ushort8 x) +{ + return convert_int8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rte(ushort8 x) +{ + return convert_int8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rtp(ushort8 x) +{ + return convert_int8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rtn(ushort8 x) +{ + return convert_int8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rtz(ushort16 x) +{ + return convert_int16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rte(ushort16 x) +{ + return convert_int16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rtp(ushort16 x) +{ + return convert_int16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rtn(ushort16 x) +{ + return convert_int16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rtz(ushort x) +{ + return convert_uint_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rte(ushort x) +{ + return convert_uint_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rtp(ushort x) +{ + return convert_uint_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rtn(ushort x) +{ + return convert_uint_sat(x); +} + +_CLC_DEF 
_CLC_OVERLOAD +uint2 convert_uint2_sat_rtz(ushort2 x) +{ + return convert_uint2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rte(ushort2 x) +{ + return convert_uint2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rtp(ushort2 x) +{ + return convert_uint2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rtn(ushort2 x) +{ + return convert_uint2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat_rtz(ushort3 x) +{ + return convert_uint3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat_rte(ushort3 x) +{ + return convert_uint3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat_rtp(ushort3 x) +{ + return convert_uint3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat_rtn(ushort3 x) +{ + return convert_uint3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rtz(ushort4 x) +{ + return convert_uint4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rte(ushort4 x) +{ + return convert_uint4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rtp(ushort4 x) +{ + return convert_uint4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rtn(ushort4 x) +{ + return convert_uint4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rtz(ushort8 x) +{ + return convert_uint8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rte(ushort8 x) +{ + return convert_uint8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rtp(ushort8 x) +{ + return convert_uint8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rtn(ushort8 x) +{ + return convert_uint8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat_rtz(ushort16 x) +{ + return convert_uint16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat_rte(ushort16 x) +{ + return convert_uint16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat_rtp(ushort16 x) +{ + return convert_uint16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 
convert_uint16_sat_rtn(ushort16 x) +{ + return convert_uint16_sat(x); +} + +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rtz(ushort x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rte(ushort x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rtp(ushort x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rtn(ushort x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rtz(ushort2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rte(ushort2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rtp(ushort2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rtn(ushort2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rtz(ushort3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rte(ushort3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rtp(ushort3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rtn(ushort3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rtz(ushort4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rte(ushort4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 
convert_long4_sat_rtp(ushort4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rtn(ushort4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rtz(ushort8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rte(ushort8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rtp(ushort8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rtn(ushort8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rtz(ushort16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rte(ushort16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rtp(ushort16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rtn(ushort16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rtz(ushort x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rte(ushort x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rtp(ushort x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rtn(ushort x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rtz(ushort2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 
+_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rte(ushort2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rtp(ushort2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rtn(ushort2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rtz(ushort3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rte(ushort3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rtp(ushort3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rtn(ushort3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rtz(ushort4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rte(ushort4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rtp(ushort4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rtn(ushort4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rtz(ushort8 x) +{ + return convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rte(ushort8 x) +{ + return convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rtp(ushort8 x) +{ + return convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rtn(ushort8 x) +{ + return 
convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rtz(ushort16 x) +{ + return convert_ulong16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rte(ushort16 x) +{ + return convert_ulong16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rtp(ushort16 x) +{ + return convert_ulong16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rtn(ushort16 x) +{ + return convert_ulong16_sat(x); +} + +#endif +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rtz(int x) +{ + return convert_char_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rte(int x) +{ + return convert_char_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rtp(int x) +{ + return convert_char_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rtn(int x) +{ + return convert_char_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rtz(int2 x) +{ + return convert_char2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rte(int2 x) +{ + return convert_char2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rtp(int2 x) +{ + return convert_char2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rtn(int2 x) +{ + return convert_char2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rtz(int3 x) +{ + return convert_char3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rte(int3 x) +{ + return convert_char3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rtp(int3 x) +{ + return convert_char3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rtn(int3 x) +{ + return convert_char3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rtz(int4 x) +{ + return convert_char4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rte(int4 x) +{ + return convert_char4_sat(x); +} + +_CLC_DEF 
_CLC_OVERLOAD +char4 convert_char4_sat_rtp(int4 x) +{ + return convert_char4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rtn(int4 x) +{ + return convert_char4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rtz(int8 x) +{ + return convert_char8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rte(int8 x) +{ + return convert_char8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rtp(int8 x) +{ + return convert_char8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rtn(int8 x) +{ + return convert_char8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rtz(int16 x) +{ + return convert_char16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rte(int16 x) +{ + return convert_char16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rtp(int16 x) +{ + return convert_char16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rtn(int16 x) +{ + return convert_char16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rtz(int x) +{ + return convert_uchar_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rte(int x) +{ + return convert_uchar_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rtp(int x) +{ + return convert_uchar_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rtn(int x) +{ + return convert_uchar_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rtz(int2 x) +{ + return convert_uchar2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rte(int2 x) +{ + return convert_uchar2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rtp(int2 x) +{ + return convert_uchar2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rtn(int2 x) +{ + return convert_uchar2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rtz(int3 x) +{ + return convert_uchar3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rte(int3 x) +{ + return 
convert_uchar3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rtp(int3 x) +{ + return convert_uchar3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rtn(int3 x) +{ + return convert_uchar3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rtz(int4 x) +{ + return convert_uchar4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rte(int4 x) +{ + return convert_uchar4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rtp(int4 x) +{ + return convert_uchar4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rtn(int4 x) +{ + return convert_uchar4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rtz(int8 x) +{ + return convert_uchar8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rte(int8 x) +{ + return convert_uchar8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rtp(int8 x) +{ + return convert_uchar8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rtn(int8 x) +{ + return convert_uchar8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rtz(int16 x) +{ + return convert_uchar16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rte(int16 x) +{ + return convert_uchar16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rtp(int16 x) +{ + return convert_uchar16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rtn(int16 x) +{ + return convert_uchar16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rtz(int x) +{ + return convert_short_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rte(int x) +{ + return convert_short_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rtp(int x) +{ + return convert_short_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rtn(int x) +{ + return convert_short_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rtz(int2 x) +{ + return convert_short2_sat(x); +} + +_CLC_DEF 
_CLC_OVERLOAD +short2 convert_short2_sat_rte(int2 x) +{ + return convert_short2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rtp(int2 x) +{ + return convert_short2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rtn(int2 x) +{ + return convert_short2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rtz(int3 x) +{ + return convert_short3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rte(int3 x) +{ + return convert_short3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rtp(int3 x) +{ + return convert_short3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rtn(int3 x) +{ + return convert_short3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rtz(int4 x) +{ + return convert_short4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rte(int4 x) +{ + return convert_short4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rtp(int4 x) +{ + return convert_short4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rtn(int4 x) +{ + return convert_short4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rtz(int8 x) +{ + return convert_short8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rte(int8 x) +{ + return convert_short8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rtp(int8 x) +{ + return convert_short8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rtn(int8 x) +{ + return convert_short8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat_rtz(int16 x) +{ + return convert_short16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat_rte(int16 x) +{ + return convert_short16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat_rtp(int16 x) +{ + return convert_short16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat_rtn(int16 x) +{ + return convert_short16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort 
convert_ushort_sat_rtz(int x) +{ + return convert_ushort_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat_rte(int x) +{ + return convert_ushort_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat_rtp(int x) +{ + return convert_ushort_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat_rtn(int x) +{ + return convert_ushort_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat_rtz(int2 x) +{ + return convert_ushort2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat_rte(int2 x) +{ + return convert_ushort2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat_rtp(int2 x) +{ + return convert_ushort2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat_rtn(int2 x) +{ + return convert_ushort2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_sat_rtz(int3 x) +{ + return convert_ushort3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_sat_rte(int3 x) +{ + return convert_ushort3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_sat_rtp(int3 x) +{ + return convert_ushort3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_sat_rtn(int3 x) +{ + return convert_ushort3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat_rtz(int4 x) +{ + return convert_ushort4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat_rte(int4 x) +{ + return convert_ushort4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat_rtp(int4 x) +{ + return convert_ushort4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat_rtn(int4 x) +{ + return convert_ushort4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_sat_rtz(int8 x) +{ + return convert_ushort8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_sat_rte(int8 x) +{ + return convert_ushort8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_sat_rtp(int8 x) +{ + return convert_ushort8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort8 
convert_ushort8_sat_rtn(int8 x) +{ + return convert_ushort8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat_rtz(int16 x) +{ + return convert_ushort16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat_rte(int16 x) +{ + return convert_ushort16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat_rtp(int16 x) +{ + return convert_ushort16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat_rtn(int16 x) +{ + return convert_ushort16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat_rtz(int x) +{ + return convert_int_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat_rte(int x) +{ + return convert_int_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat_rtp(int x) +{ + return convert_int_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat_rtn(int x) +{ + return convert_int_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat_rtz(int2 x) +{ + return convert_int2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat_rte(int2 x) +{ + return convert_int2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat_rtp(int2 x) +{ + return convert_int2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat_rtn(int2 x) +{ + return convert_int2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat_rtz(int3 x) +{ + return convert_int3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat_rte(int3 x) +{ + return convert_int3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat_rtp(int3 x) +{ + return convert_int3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat_rtn(int3 x) +{ + return convert_int3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat_rtz(int4 x) +{ + return convert_int4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat_rte(int4 x) +{ + return convert_int4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat_rtp(int4 x) +{ + return convert_int4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int4 
convert_int4_sat_rtn(int4 x) +{ + return convert_int4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rtz(int8 x) +{ + return convert_int8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rte(int8 x) +{ + return convert_int8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rtp(int8 x) +{ + return convert_int8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rtn(int8 x) +{ + return convert_int8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rtz(int16 x) +{ + return convert_int16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rte(int16 x) +{ + return convert_int16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rtp(int16 x) +{ + return convert_int16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rtn(int16 x) +{ + return convert_int16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rtz(int x) +{ + return convert_uint_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rte(int x) +{ + return convert_uint_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rtp(int x) +{ + return convert_uint_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rtn(int x) +{ + return convert_uint_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rtz(int2 x) +{ + return convert_uint2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rte(int2 x) +{ + return convert_uint2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rtp(int2 x) +{ + return convert_uint2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rtn(int2 x) +{ + return convert_uint2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat_rtz(int3 x) +{ + return convert_uint3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat_rte(int3 x) +{ + return convert_uint3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat_rtp(int3 x) +{ + return convert_uint3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint3 
convert_uint3_sat_rtn(int3 x) +{ + return convert_uint3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rtz(int4 x) +{ + return convert_uint4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rte(int4 x) +{ + return convert_uint4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rtp(int4 x) +{ + return convert_uint4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rtn(int4 x) +{ + return convert_uint4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rtz(int8 x) +{ + return convert_uint8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rte(int8 x) +{ + return convert_uint8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rtp(int8 x) +{ + return convert_uint8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rtn(int8 x) +{ + return convert_uint8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat_rtz(int16 x) +{ + return convert_uint16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat_rte(int16 x) +{ + return convert_uint16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat_rtp(int16 x) +{ + return convert_uint16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat_rtn(int16 x) +{ + return convert_uint16_sat(x); +} + +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rtz(int x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rte(int x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rtp(int x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rtn(int x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rtz(int2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 
convert_long2_sat_rte(int2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rtp(int2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rtn(int2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rtz(int3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rte(int3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rtp(int3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rtn(int3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rtz(int4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rte(int4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rtp(int4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rtn(int4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rtz(int8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rte(int8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rtp(int8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rtn(int8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rtz(int16 x) 
+{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rte(int16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rtp(int16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rtn(int16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rtz(int x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rte(int x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rtp(int x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rtn(int x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rtz(int2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rte(int2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rtp(int2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rtn(int2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rtz(int3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rte(int3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rtp(int3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rtn(int3 x) +{ 
+ return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rtz(int4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rte(int4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rtp(int4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rtn(int4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rtz(int8 x) +{ + return convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rte(int8 x) +{ + return convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rtp(int8 x) +{ + return convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rtn(int8 x) +{ + return convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rtz(int16 x) +{ + return convert_ulong16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rte(int16 x) +{ + return convert_ulong16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rtp(int16 x) +{ + return convert_ulong16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rtn(int16 x) +{ + return convert_ulong16_sat(x); +} + +#endif +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rtz(uint x) +{ + return convert_char_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rte(uint x) +{ + return convert_char_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rtp(uint x) +{ + return convert_char_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char 
convert_char_sat_rtn(uint x) +{ + return convert_char_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rtz(uint2 x) +{ + return convert_char2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rte(uint2 x) +{ + return convert_char2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rtp(uint2 x) +{ + return convert_char2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rtn(uint2 x) +{ + return convert_char2_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rtz(uint3 x) +{ + return convert_char3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rte(uint3 x) +{ + return convert_char3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rtp(uint3 x) +{ + return convert_char3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rtn(uint3 x) +{ + return convert_char3_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rtz(uint4 x) +{ + return convert_char4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rte(uint4 x) +{ + return convert_char4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rtp(uint4 x) +{ + return convert_char4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rtn(uint4 x) +{ + return convert_char4_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rtz(uint8 x) +{ + return convert_char8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rte(uint8 x) +{ + return convert_char8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rtp(uint8 x) +{ + return convert_char8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rtn(uint8 x) +{ + return convert_char8_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rtz(uint16 x) +{ + return convert_char16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rte(uint16 x) +{ + return convert_char16_sat(x); +} + +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rtp(uint16 x) +{ + return convert_char16_sat(x); +} + 
/*
 * Saturating integer conversions that carry an explicit rounding-mode
 * suffix (_rtz, _rte, _rtp, _rtn).
 *
 * Every function generated in this section takes one argument x of type
 * SRC and returns convert_<DST>_sat(x) unchanged: the suffix only selects
 * the function name, it does not alter the computed value.
 */

/* Emits a single wrapper:  DST convert_<DST>_sat_<MODE>(SRC x). */
#define _CLC_SAT_ROUND_ALIAS(DST, SRC, MODE)                              \
  _CLC_DEF _CLC_OVERLOAD                                                  \
  DST convert_##DST##_sat_##MODE(SRC x)                                   \
  {                                                                       \
    return convert_##DST##_sat(x);                                        \
  }

/*
 * Emits the four wrappers for one (DST, SRC) pair, in the order
 * rtz, rte, rtp, rtn.
 *
 * The four definitions are spelled out here instead of being built on
 * _CLC_SAT_ROUND_ALIAS so that DST is only ever a direct operand of ##;
 * that keeps the pasted function name intact even if a type name happens
 * to be a preprocessor macro.
 */
#define _CLC_SAT_ROUND_ALIASES(DST, SRC)                                  \
  _CLC_DEF _CLC_OVERLOAD                                                  \
  DST convert_##DST##_sat_rtz(SRC x)                                      \
  {                                                                       \
    return convert_##DST##_sat(x);                                        \
  }                                                                       \
  _CLC_DEF _CLC_OVERLOAD                                                  \
  DST convert_##DST##_sat_rte(SRC x)                                      \
  {                                                                       \
    return convert_##DST##_sat(x);                                        \
  }                                                                       \
  _CLC_DEF _CLC_OVERLOAD                                                  \
  DST convert_##DST##_sat_rtp(SRC x)                                      \
  {                                                                       \
    return convert_##DST##_sat(x);                                        \
  }                                                                       \
  _CLC_DEF _CLC_OVERLOAD                                                  \
  DST convert_##DST##_sat_rtn(SRC x)                                      \
  {                                                                       \
    return convert_##DST##_sat(x);                                        \
  }

/* ---- source: uint / uintN, destination: 8/16/32-bit integers ---- */

/* Only the rtn variant of uint16 -> char16 falls inside this section. */
_CLC_SAT_ROUND_ALIAS(char16, uint16, rtn)

_CLC_SAT_ROUND_ALIASES(uchar,    uint)
_CLC_SAT_ROUND_ALIASES(uchar2,   uint2)
_CLC_SAT_ROUND_ALIASES(uchar3,   uint3)
_CLC_SAT_ROUND_ALIASES(uchar4,   uint4)
_CLC_SAT_ROUND_ALIASES(uchar8,   uint8)
_CLC_SAT_ROUND_ALIASES(uchar16,  uint16)

_CLC_SAT_ROUND_ALIASES(short,    uint)
_CLC_SAT_ROUND_ALIASES(short2,   uint2)
_CLC_SAT_ROUND_ALIASES(short3,   uint3)
_CLC_SAT_ROUND_ALIASES(short4,   uint4)
_CLC_SAT_ROUND_ALIASES(short8,   uint8)
_CLC_SAT_ROUND_ALIASES(short16,  uint16)

_CLC_SAT_ROUND_ALIASES(ushort,   uint)
_CLC_SAT_ROUND_ALIASES(ushort2,  uint2)
_CLC_SAT_ROUND_ALIASES(ushort3,  uint3)
_CLC_SAT_ROUND_ALIASES(ushort4,  uint4)
_CLC_SAT_ROUND_ALIASES(ushort8,  uint8)
_CLC_SAT_ROUND_ALIASES(ushort16, uint16)

_CLC_SAT_ROUND_ALIASES(int,      uint)
_CLC_SAT_ROUND_ALIASES(int2,     uint2)
_CLC_SAT_ROUND_ALIASES(int3,     uint3)
_CLC_SAT_ROUND_ALIASES(int4,     uint4)
_CLC_SAT_ROUND_ALIASES(int8,     uint8)
_CLC_SAT_ROUND_ALIASES(int16,    uint16)

_CLC_SAT_ROUND_ALIASES(uint,     uint)
_CLC_SAT_ROUND_ALIASES(uint2,    uint2)
_CLC_SAT_ROUND_ALIASES(uint3,    uint3)
_CLC_SAT_ROUND_ALIASES(uint4,    uint4)
_CLC_SAT_ROUND_ALIASES(uint8,    uint8)
_CLC_SAT_ROUND_ALIASES(uint16,   uint16)

/*
 * Everything below involves a 64-bit type on one side of the conversion
 * and is therefore compiled only when cles_khr_int64 is defined.
 */
#ifdef cles_khr_int64

/* ---- source: uint / uintN, destination: long / ulong ---- */

_CLC_SAT_ROUND_ALIASES(long,     uint)
_CLC_SAT_ROUND_ALIASES(long2,    uint2)
_CLC_SAT_ROUND_ALIASES(long3,    uint3)
_CLC_SAT_ROUND_ALIASES(long4,    uint4)
_CLC_SAT_ROUND_ALIASES(long8,    uint8)
_CLC_SAT_ROUND_ALIASES(long16,   uint16)

_CLC_SAT_ROUND_ALIASES(ulong,    uint)
_CLC_SAT_ROUND_ALIASES(ulong2,   uint2)
_CLC_SAT_ROUND_ALIASES(ulong3,   uint3)
_CLC_SAT_ROUND_ALIASES(ulong4,   uint4)
_CLC_SAT_ROUND_ALIASES(ulong8,   uint8)
_CLC_SAT_ROUND_ALIASES(ulong16,  uint16)

/* ---- source: long / longN, destination: 8/16-bit integers ---- */

_CLC_SAT_ROUND_ALIASES(char,     long)
_CLC_SAT_ROUND_ALIASES(char2,    long2)
_CLC_SAT_ROUND_ALIASES(char3,    long3)
_CLC_SAT_ROUND_ALIASES(char4,    long4)
_CLC_SAT_ROUND_ALIASES(char8,    long8)
_CLC_SAT_ROUND_ALIASES(char16,   long16)

_CLC_SAT_ROUND_ALIASES(uchar,    long)
_CLC_SAT_ROUND_ALIASES(uchar2,   long2)
_CLC_SAT_ROUND_ALIASES(uchar3,   long3)
_CLC_SAT_ROUND_ALIASES(uchar4,   long4)
_CLC_SAT_ROUND_ALIASES(uchar8,   long8)
_CLC_SAT_ROUND_ALIASES(uchar16,  long16)

_CLC_SAT_ROUND_ALIASES(short,    long)
_CLC_SAT_ROUND_ALIASES(short2,   long2)
_CLC_SAT_ROUND_ALIASES(short3,   long3)
_CLC_SAT_ROUND_ALIASES(short4,   long4)
_CLC_SAT_ROUND_ALIASES(short8,   long8)
_CLC_SAT_ROUND_ALIASES(short16,  long16)

_CLC_SAT_ROUND_ALIASES(ushort,   long)
_CLC_SAT_ROUND_ALIASES(ushort2,  long2)

/* Only the rtz and rte variants of long3 -> ushort3 fall inside this section. */
_CLC_SAT_ROUND_ALIAS(ushort3, long3, rtz)
_CLC_SAT_ROUND_ALIAS(ushort3, long3, rte)

#endif /* cles_khr_int64 */

/* The generator macros are local to this section; do not leak them. */
#undef _CLC_SAT_ROUND_ALIASES
#undef _CLC_SAT_ROUND_ALIAS
/*
 * Saturating conversions from long / longN with an explicit rounding-mode
 * suffix, for the destinations ushort3 (rtp and rtn only), ushort4,
 * ushort8, ushort16, int, int2, int3 and int4.
 *
 * Each generated function takes one argument x of type SRC and returns
 * convert_<DST>_sat(x) unchanged: the suffix only selects the function
 * name. The source type is 64-bit, so the whole section is compiled only
 * when cles_khr_int64 is defined.
 */
#ifdef cles_khr_int64

/* Emits a single wrapper:  DST convert_<DST>_sat_<MODE>(SRC x). */
#define _CLC_SAT_ROUND_ALIAS(DST, SRC, MODE)                              \
  _CLC_DEF _CLC_OVERLOAD                                                  \
  DST convert_##DST##_sat_##MODE(SRC x)                                   \
  {                                                                       \
    return convert_##DST##_sat(x);                                        \
  }

/*
 * Emits the four wrappers for one (DST, SRC) pair, in the order
 * rtz, rte, rtp, rtn. Spelled out rather than layered on the single-mode
 * macro so that DST is only ever a direct operand of ##, which keeps the
 * pasted name intact even if a type name is itself a preprocessor macro.
 */
#define _CLC_SAT_ROUND_ALIASES(DST, SRC)                                  \
  _CLC_DEF _CLC_OVERLOAD                                                  \
  DST convert_##DST##_sat_rtz(SRC x)                                      \
  {                                                                       \
    return convert_##DST##_sat(x);                                        \
  }                                                                       \
  _CLC_DEF _CLC_OVERLOAD                                                  \
  DST convert_##DST##_sat_rte(SRC x)                                      \
  {                                                                       \
    return convert_##DST##_sat(x);                                        \
  }                                                                       \
  _CLC_DEF _CLC_OVERLOAD                                                  \
  DST convert_##DST##_sat_rtp(SRC x)                                      \
  {                                                                       \
    return convert_##DST##_sat(x);                                        \
  }                                                                       \
  _CLC_DEF _CLC_OVERLOAD                                                  \
  DST convert_##DST##_sat_rtn(SRC x)                                      \
  {                                                                       \
    return convert_##DST##_sat(x);                                        \
  }

/* Only the rtp and rtn variants of long3 -> ushort3 fall inside this section. */
_CLC_SAT_ROUND_ALIAS(ushort3, long3, rtp)
_CLC_SAT_ROUND_ALIAS(ushort3, long3, rtn)

_CLC_SAT_ROUND_ALIASES(ushort4,  long4)
_CLC_SAT_ROUND_ALIASES(ushort8,  long8)
_CLC_SAT_ROUND_ALIASES(ushort16, long16)

_CLC_SAT_ROUND_ALIASES(int,      long)
_CLC_SAT_ROUND_ALIASES(int2,     long2)
_CLC_SAT_ROUND_ALIASES(int3,     long3)
_CLC_SAT_ROUND_ALIASES(int4,     long4)

/* The generator macros are local to this section; do not leak them. */
#undef _CLC_SAT_ROUND_ALIASES
#undef _CLC_SAT_ROUND_ALIAS

#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rtz(long8 x) +{ + return convert_int8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rte(long8 x) +{ + return convert_int8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rtp(long8 x) +{ + return convert_int8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rtn(long8 x) +{ + return convert_int8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rtz(long16 x) +{ + return convert_int16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rte(long16 x) +{ + return convert_int16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rtp(long16 x) +{ + return convert_int16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rtn(long16 x) +{ + return convert_int16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rtz(long x) +{ + return convert_uint_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rte(long x) +{ + return convert_uint_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rtp(long x) +{ + return convert_uint_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rtn(long x) +{ + return convert_uint_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rtz(long2 x) +{ + return convert_uint2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rte(long2 x) +{ + return convert_uint2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rtp(long2 x) +{ + return convert_uint2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF 
_CLC_OVERLOAD +uint2 convert_uint2_sat_rtn(long2 x) +{ + return convert_uint2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat_rtz(long3 x) +{ + return convert_uint3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat_rte(long3 x) +{ + return convert_uint3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat_rtp(long3 x) +{ + return convert_uint3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat_rtn(long3 x) +{ + return convert_uint3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rtz(long4 x) +{ + return convert_uint4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rte(long4 x) +{ + return convert_uint4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rtp(long4 x) +{ + return convert_uint4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rtn(long4 x) +{ + return convert_uint4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rtz(long8 x) +{ + return convert_uint8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rte(long8 x) +{ + return convert_uint8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rtp(long8 x) +{ + return convert_uint8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rtn(long8 x) +{ + return convert_uint8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat_rtz(long16 x) +{ + return convert_uint16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat_rte(long16 x) +{ + return convert_uint16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF 
_CLC_OVERLOAD +uint16 convert_uint16_sat_rtp(long16 x) +{ + return convert_uint16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat_rtn(long16 x) +{ + return convert_uint16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rtz(long x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rte(long x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rtp(long x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rtn(long x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rtz(long2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rte(long2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rtp(long2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rtn(long2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rtz(long3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rte(long3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rtp(long3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rtn(long3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rtz(long4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 
convert_long4_sat_rte(long4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rtp(long4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rtn(long4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rtz(long8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rte(long8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rtp(long8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rtn(long8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rtz(long16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rte(long16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rtp(long16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rtn(long16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rtz(long x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rte(long x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rtp(long x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rtn(long x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 
convert_ulong2_sat_rtz(long2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rte(long2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rtp(long2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rtn(long2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rtz(long3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rte(long3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rtp(long3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rtn(long3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rtz(long4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rte(long4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rtp(long4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rtn(long4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rtz(long8 x) +{ + return convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rte(long8 x) +{ + return convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rtp(long8 x) +{ + return convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 
+_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rtn(long8 x) +{ + return convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rtz(long16 x) +{ + return convert_ulong16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rte(long16 x) +{ + return convert_ulong16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rtp(long16 x) +{ + return convert_ulong16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rtn(long16 x) +{ + return convert_ulong16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rtz(ulong x) +{ + return convert_char_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rte(ulong x) +{ + return convert_char_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rtp(ulong x) +{ + return convert_char_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rtn(ulong x) +{ + return convert_char_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rtz(ulong2 x) +{ + return convert_char2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rte(ulong2 x) +{ + return convert_char2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rtp(ulong2 x) +{ + return convert_char2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rtn(ulong2 x) +{ + return convert_char2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rtz(ulong3 x) +{ + return convert_char3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rte(ulong3 x) +{ + return convert_char3_sat(x); +} + +#endif +#ifdef 
cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rtp(ulong3 x) +{ + return convert_char3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rtn(ulong3 x) +{ + return convert_char3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rtz(ulong4 x) +{ + return convert_char4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rte(ulong4 x) +{ + return convert_char4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rtp(ulong4 x) +{ + return convert_char4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rtn(ulong4 x) +{ + return convert_char4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rtz(ulong8 x) +{ + return convert_char8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rte(ulong8 x) +{ + return convert_char8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rtp(ulong8 x) +{ + return convert_char8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rtn(ulong8 x) +{ + return convert_char8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rtz(ulong16 x) +{ + return convert_char16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rte(ulong16 x) +{ + return convert_char16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rtp(ulong16 x) +{ + return convert_char16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rtn(ulong16 x) +{ + return convert_char16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rtz(ulong x) +{ + return convert_uchar_sat(x); +} + 
+#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rte(ulong x) +{ + return convert_uchar_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rtp(ulong x) +{ + return convert_uchar_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rtn(ulong x) +{ + return convert_uchar_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rtz(ulong2 x) +{ + return convert_uchar2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rte(ulong2 x) +{ + return convert_uchar2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rtp(ulong2 x) +{ + return convert_uchar2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rtn(ulong2 x) +{ + return convert_uchar2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rtz(ulong3 x) +{ + return convert_uchar3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rte(ulong3 x) +{ + return convert_uchar3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rtp(ulong3 x) +{ + return convert_uchar3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rtn(ulong3 x) +{ + return convert_uchar3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rtz(ulong4 x) +{ + return convert_uchar4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rte(ulong4 x) +{ + return convert_uchar4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rtp(ulong4 x) +{ + return convert_uchar4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rtn(ulong4 x) +{ + return 
convert_uchar4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rtz(ulong8 x) +{ + return convert_uchar8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rte(ulong8 x) +{ + return convert_uchar8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rtp(ulong8 x) +{ + return convert_uchar8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rtn(ulong8 x) +{ + return convert_uchar8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rtz(ulong16 x) +{ + return convert_uchar16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rte(ulong16 x) +{ + return convert_uchar16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rtp(ulong16 x) +{ + return convert_uchar16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rtn(ulong16 x) +{ + return convert_uchar16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rtz(ulong x) +{ + return convert_short_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rte(ulong x) +{ + return convert_short_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rtp(ulong x) +{ + return convert_short_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rtn(ulong x) +{ + return convert_short_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rtz(ulong2 x) +{ + return convert_short2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rte(ulong2 x) +{ + return convert_short2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short2 
convert_short2_sat_rtp(ulong2 x) +{ + return convert_short2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rtn(ulong2 x) +{ + return convert_short2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rtz(ulong3 x) +{ + return convert_short3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rte(ulong3 x) +{ + return convert_short3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rtp(ulong3 x) +{ + return convert_short3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rtn(ulong3 x) +{ + return convert_short3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rtz(ulong4 x) +{ + return convert_short4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rte(ulong4 x) +{ + return convert_short4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rtp(ulong4 x) +{ + return convert_short4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rtn(ulong4 x) +{ + return convert_short4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rtz(ulong8 x) +{ + return convert_short8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rte(ulong8 x) +{ + return convert_short8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rtp(ulong8 x) +{ + return convert_short8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rtn(ulong8 x) +{ + return convert_short8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat_rtz(ulong16 x) +{ + return convert_short16_sat(x); +} + +#endif +#ifdef 
cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat_rte(ulong16 x) +{ + return convert_short16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat_rtp(ulong16 x) +{ + return convert_short16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +short16 convert_short16_sat_rtn(ulong16 x) +{ + return convert_short16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat_rtz(ulong x) +{ + return convert_ushort_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat_rte(ulong x) +{ + return convert_ushort_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat_rtp(ulong x) +{ + return convert_ushort_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort convert_ushort_sat_rtn(ulong x) +{ + return convert_ushort_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat_rtz(ulong2 x) +{ + return convert_ushort2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat_rte(ulong2 x) +{ + return convert_ushort2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat_rtp(ulong2 x) +{ + return convert_ushort2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort2 convert_ushort2_sat_rtn(ulong2 x) +{ + return convert_ushort2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_sat_rtz(ulong3 x) +{ + return convert_ushort3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_sat_rte(ulong3 x) +{ + return convert_ushort3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort3 convert_ushort3_sat_rtp(ulong3 x) +{ + return convert_ushort3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort3 
convert_ushort3_sat_rtn(ulong3 x) +{ + return convert_ushort3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat_rtz(ulong4 x) +{ + return convert_ushort4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat_rte(ulong4 x) +{ + return convert_ushort4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat_rtp(ulong4 x) +{ + return convert_ushort4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort4 convert_ushort4_sat_rtn(ulong4 x) +{ + return convert_ushort4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_sat_rtz(ulong8 x) +{ + return convert_ushort8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_sat_rte(ulong8 x) +{ + return convert_ushort8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_sat_rtp(ulong8 x) +{ + return convert_ushort8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort8 convert_ushort8_sat_rtn(ulong8 x) +{ + return convert_ushort8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat_rtz(ulong16 x) +{ + return convert_ushort16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat_rte(ulong16 x) +{ + return convert_ushort16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat_rtp(ulong16 x) +{ + return convert_ushort16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ushort16 convert_ushort16_sat_rtn(ulong16 x) +{ + return convert_ushort16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat_rtz(ulong x) +{ + return convert_int_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat_rte(ulong x) +{ + return 
convert_int_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat_rtp(ulong x) +{ + return convert_int_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int convert_int_sat_rtn(ulong x) +{ + return convert_int_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat_rtz(ulong2 x) +{ + return convert_int2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat_rte(ulong2 x) +{ + return convert_int2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat_rtp(ulong2 x) +{ + return convert_int2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int2 convert_int2_sat_rtn(ulong2 x) +{ + return convert_int2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat_rtz(ulong3 x) +{ + return convert_int3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat_rte(ulong3 x) +{ + return convert_int3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat_rtp(ulong3 x) +{ + return convert_int3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int3 convert_int3_sat_rtn(ulong3 x) +{ + return convert_int3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat_rtz(ulong4 x) +{ + return convert_int4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat_rte(ulong4 x) +{ + return convert_int4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat_rtp(ulong4 x) +{ + return convert_int4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int4 convert_int4_sat_rtn(ulong4 x) +{ + return convert_int4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rtz(ulong8 x) +{ + return convert_int8_sat(x); +} + +#endif +#ifdef cles_khr_int64 
+_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rte(ulong8 x) +{ + return convert_int8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rtp(ulong8 x) +{ + return convert_int8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int8 convert_int8_sat_rtn(ulong8 x) +{ + return convert_int8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rtz(ulong16 x) +{ + return convert_int16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rte(ulong16 x) +{ + return convert_int16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rtp(ulong16 x) +{ + return convert_int16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +int16 convert_int16_sat_rtn(ulong16 x) +{ + return convert_int16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rtz(ulong x) +{ + return convert_uint_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rte(ulong x) +{ + return convert_uint_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rtp(ulong x) +{ + return convert_uint_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint convert_uint_sat_rtn(ulong x) +{ + return convert_uint_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rtz(ulong2 x) +{ + return convert_uint2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rte(ulong2 x) +{ + return convert_uint2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rtp(ulong2 x) +{ + return convert_uint2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint2 convert_uint2_sat_rtn(ulong2 x) +{ + return convert_uint2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD 
+uint3 convert_uint3_sat_rtz(ulong3 x) +{ + return convert_uint3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat_rte(ulong3 x) +{ + return convert_uint3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat_rtp(ulong3 x) +{ + return convert_uint3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint3 convert_uint3_sat_rtn(ulong3 x) +{ + return convert_uint3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rtz(ulong4 x) +{ + return convert_uint4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rte(ulong4 x) +{ + return convert_uint4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rtp(ulong4 x) +{ + return convert_uint4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint4 convert_uint4_sat_rtn(ulong4 x) +{ + return convert_uint4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rtz(ulong8 x) +{ + return convert_uint8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rte(ulong8 x) +{ + return convert_uint8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rtp(ulong8 x) +{ + return convert_uint8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint8 convert_uint8_sat_rtn(ulong8 x) +{ + return convert_uint8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat_rtz(ulong16 x) +{ + return convert_uint16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat_rte(ulong16 x) +{ + return convert_uint16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +uint16 convert_uint16_sat_rtp(ulong16 x) +{ + return convert_uint16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF 
_CLC_OVERLOAD +uint16 convert_uint16_sat_rtn(ulong16 x) +{ + return convert_uint16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rtz(ulong x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rte(ulong x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rtp(ulong x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long convert_long_sat_rtn(ulong x) +{ + return convert_long_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rtz(ulong2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rte(ulong2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rtp(ulong2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rtn(ulong2 x) +{ + return convert_long2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rtz(ulong3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rte(ulong3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rtp(ulong3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rtn(ulong3 x) +{ + return convert_long3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rtz(ulong4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rte(ulong4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD 
+long4 convert_long4_sat_rtp(ulong4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rtn(ulong4 x) +{ + return convert_long4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rtz(ulong8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rte(ulong8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rtp(ulong8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rtn(ulong8 x) +{ + return convert_long8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rtz(ulong16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rte(ulong16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rtp(ulong16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rtn(ulong16 x) +{ + return convert_long16_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rtz(ulong x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rte(ulong x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rtp(ulong x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rtn(ulong x) +{ + return convert_ulong_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rtz(ulong2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF 
_CLC_OVERLOAD +ulong2 convert_ulong2_sat_rte(ulong2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rtp(ulong2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rtn(ulong2 x) +{ + return convert_ulong2_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rtz(ulong3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rte(ulong3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rtp(ulong3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rtn(ulong3 x) +{ + return convert_ulong3_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rtz(ulong4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rte(ulong4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rtp(ulong4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rtn(ulong4 x) +{ + return convert_ulong4_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rtz(ulong8 x) +{ + return convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rte(ulong8 x) +{ + return convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rtp(ulong8 x) +{ + return convert_ulong8_sat(x); +} + +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rtn(ulong8 x) +{ + return convert_ulong8_sat(x); +} + 
#endif
/*
 * Saturating ulong16 -> ulong16 conversions with an explicit rounding
 * suffix.  Each variant forwards its argument unchanged to
 * convert_ulong16_sat().  Only compiled when cles_khr_int64 is defined.
 */
#ifdef cles_khr_int64
_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rtz(ulong16 x) { return convert_ulong16_sat(x); }
_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rte(ulong16 x) { return convert_ulong16_sat(x); }
_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rtp(ulong16 x) { return convert_ulong16_sat(x); }
_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rtn(ulong16 x) { return convert_ulong16_sat(x); }
#endif

/*
 * Explicit-rounding floatN -> charN conversions.
 *   _rtz : forwards the argument as-is to convert_charN / convert_charN_sat
 *   _rte : rounds the argument with rint()  first, then forwards
 *   _rtp : rounds the argument with ceil()  first, then forwards
 *   _rtn : rounds the argument with floor() first, then forwards
 */
/* float -> char */
_CLC_DEF _CLC_OVERLOAD char convert_char_rtz(float x) { return convert_char(x); }
_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rtz(float x) { return convert_char_sat(x); }
_CLC_DEF _CLC_OVERLOAD char convert_char_rte(float x) { return convert_char(rint(x)); }
_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rte(float x) { return convert_char_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD char convert_char_rtp(float x) { return convert_char(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rtp(float x) { return convert_char_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD char convert_char_rtn(float x) { return convert_char(floor(x)); }
_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rtn(float x) { return convert_char_sat(floor(x)); }

/* float2 -> char2 */
_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtz(float2 x) { return convert_char2(x); }
_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rtz(float2 x) { return convert_char2_sat(x); }
_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rte(float2 x) { return convert_char2(rint(x)); }
_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rte(float2 x) { return convert_char2_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtp(float2 x) { return convert_char2(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD
char2 convert_char2_sat_rtp(float2 x)
{
    x = ceil(x);
return convert_char2_sat(x);
}
/*
 * Explicit-rounding floatN -> charN conversions (continued).
 *   _rtz : forwards the argument as-is to convert_charN / convert_charN_sat
 *   _rte : rounds the argument with rint()  first, then forwards
 *   _rtp : rounds the argument with ceil()  first, then forwards
 *   _rtn : rounds the argument with floor() first, then forwards
 */
/* float2 -> char2 */
_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtn(float2 x) { return convert_char2(floor(x)); }
_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rtn(float2 x) { return convert_char2_sat(floor(x)); }

/* float3 -> char3 */
_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtz(float3 x) { return convert_char3(x); }
_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rtz(float3 x) { return convert_char3_sat(x); }
_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rte(float3 x) { return convert_char3(rint(x)); }
_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rte(float3 x) { return convert_char3_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtp(float3 x) { return convert_char3(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rtp(float3 x) { return convert_char3_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtn(float3 x) { return convert_char3(floor(x)); }
_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rtn(float3 x) { return convert_char3_sat(floor(x)); }

/* float4 -> char4 */
_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtz(float4 x) { return convert_char4(x); }
_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rtz(float4 x) { return convert_char4_sat(x); }
_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rte(float4 x) { return convert_char4(rint(x)); }
_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rte(float4 x) { return convert_char4_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtp(float4 x) { return convert_char4(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rtp(float4 x) { return convert_char4_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtn(float4 x) { return convert_char4(floor(x)); }
_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rtn(float4 x) { return convert_char4_sat(floor(x)); }
_CLC_DEF _CLC_OVERLOAD
char8 convert_char8_rtz(float8 x)
{
    return convert_char8(x);
}
/*
 * Explicit-rounding floatN -> charN / ucharN conversions.
 *   _rtz : forwards the argument as-is to convert_T / convert_T_sat
 *   _rte : rounds the argument with rint()  first, then forwards
 *   _rtp : rounds the argument with ceil()  first, then forwards
 *   _rtn : rounds the argument with floor() first, then forwards
 */
/* float8 -> char8 */
_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rtz(float8 x) { return convert_char8_sat(x); }
_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rte(float8 x) { return convert_char8(rint(x)); }
_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rte(float8 x) { return convert_char8_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtp(float8 x) { return convert_char8(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rtp(float8 x) { return convert_char8_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtn(float8 x) { return convert_char8(floor(x)); }
_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rtn(float8 x) { return convert_char8_sat(floor(x)); }

/* float16 -> char16 */
_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtz(float16 x) { return convert_char16(x); }
_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rtz(float16 x) { return convert_char16_sat(x); }
_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rte(float16 x) { return convert_char16(rint(x)); }
_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rte(float16 x) { return convert_char16_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtp(float16 x) { return convert_char16(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rtp(float16 x) { return convert_char16_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtn(float16 x) { return convert_char16(floor(x)); }
_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rtn(float16 x) { return convert_char16_sat(floor(x)); }

/* float -> uchar */
_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtz(float x) { return convert_uchar(x); }
_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rtz(float x) { return convert_uchar_sat(x); }
_CLC_DEF _CLC_OVERLOAD
uchar convert_uchar_rte(float x)
{
    x = rint(x);
return convert_uchar(x);
}
/*
 * Explicit-rounding floatN -> ucharN conversions.
 *   _rtz : forwards the argument as-is to convert_ucharN / convert_ucharN_sat
 *   _rte : rounds the argument with rint()  first, then forwards
 *   _rtp : rounds the argument with ceil()  first, then forwards
 *   _rtn : rounds the argument with floor() first, then forwards
 */
/* float -> uchar */
_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rte(float x) { return convert_uchar_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtp(float x) { return convert_uchar(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rtp(float x) { return convert_uchar_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtn(float x) { return convert_uchar(floor(x)); }
_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rtn(float x) { return convert_uchar_sat(floor(x)); }

/* float2 -> uchar2 */
_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtz(float2 x) { return convert_uchar2(x); }
_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rtz(float2 x) { return convert_uchar2_sat(x); }
_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rte(float2 x) { return convert_uchar2(rint(x)); }
_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rte(float2 x) { return convert_uchar2_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtp(float2 x) { return convert_uchar2(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rtp(float2 x) { return convert_uchar2_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtn(float2 x) { return convert_uchar2(floor(x)); }
_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rtn(float2 x) { return convert_uchar2_sat(floor(x)); }

/* float3 -> uchar3 */
_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtz(float3 x) { return convert_uchar3(x); }
_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rtz(float3 x) { return convert_uchar3_sat(x); }
_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rte(float3 x) { return convert_uchar3(rint(x)); }
_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rte(float3 x) { return convert_uchar3_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD
uchar3 convert_uchar3_rtp(float3 x)
{
    x = ceil(x);
    return
convert_uchar3(x);
}
/*
 * Explicit-rounding floatN -> ucharN conversions (continued).
 *   _rtz : forwards the argument as-is to convert_ucharN / convert_ucharN_sat
 *   _rte : rounds the argument with rint()  first, then forwards
 *   _rtp : rounds the argument with ceil()  first, then forwards
 *   _rtn : rounds the argument with floor() first, then forwards
 */
/* float3 -> uchar3 */
_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rtp(float3 x) { return convert_uchar3_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtn(float3 x) { return convert_uchar3(floor(x)); }
_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rtn(float3 x) { return convert_uchar3_sat(floor(x)); }

/* float4 -> uchar4 */
_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtz(float4 x) { return convert_uchar4(x); }
_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat_rtz(float4 x) { return convert_uchar4_sat(x); }
_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rte(float4 x) { return convert_uchar4(rint(x)); }
_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat_rte(float4 x) { return convert_uchar4_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtp(float4 x) { return convert_uchar4(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat_rtp(float4 x) { return convert_uchar4_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtn(float4 x) { return convert_uchar4(floor(x)); }
_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat_rtn(float4 x) { return convert_uchar4_sat(floor(x)); }

/* float8 -> uchar8 */
_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtz(float8 x) { return convert_uchar8(x); }
_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat_rtz(float8 x) { return convert_uchar8_sat(x); }
_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rte(float8 x) { return convert_uchar8(rint(x)); }
_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat_rte(float8 x) { return convert_uchar8_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtp(float8 x) { return convert_uchar8(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat_rtp(float8 x) { return convert_uchar8_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD
uchar8 convert_uchar8_rtn(float8 x)
{
    x = floor(x);
    return
convert_uchar8(x);
}
/*
 * Explicit-rounding floatN -> ucharN / short conversions.
 *   _rtz : forwards the argument as-is to convert_T / convert_T_sat
 *   _rte : rounds the argument with rint()  first, then forwards
 *   _rtp : rounds the argument with ceil()  first, then forwards
 *   _rtn : rounds the argument with floor() first, then forwards
 */
/* float8 -> uchar8 */
_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat_rtn(float8 x) { return convert_uchar8_sat(floor(x)); }

/* float16 -> uchar16 */
_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtz(float16 x) { return convert_uchar16(x); }
_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat_rtz(float16 x) { return convert_uchar16_sat(x); }
_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rte(float16 x) { return convert_uchar16(rint(x)); }
_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat_rte(float16 x) { return convert_uchar16_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtp(float16 x) { return convert_uchar16(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat_rtp(float16 x) { return convert_uchar16_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtn(float16 x) { return convert_uchar16(floor(x)); }
_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat_rtn(float16 x) { return convert_uchar16_sat(floor(x)); }

/* float -> short */
_CLC_DEF _CLC_OVERLOAD short convert_short_rtz(float x) { return convert_short(x); }
_CLC_DEF _CLC_OVERLOAD short convert_short_sat_rtz(float x) { return convert_short_sat(x); }
_CLC_DEF _CLC_OVERLOAD short convert_short_rte(float x) { return convert_short(rint(x)); }
_CLC_DEF _CLC_OVERLOAD short convert_short_sat_rte(float x) { return convert_short_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD short convert_short_rtp(float x) { return convert_short(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD short convert_short_sat_rtp(float x) { return convert_short_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD short convert_short_rtn(float x) { return convert_short(floor(x)); }
_CLC_DEF _CLC_OVERLOAD short convert_short_sat_rtn(float x) { return convert_short_sat(floor(x)); }
_CLC_DEF _CLC_OVERLOAD
short2 convert_short2_rtz(float2 x)
{
    return
convert_short2(x);
}
/*
 * Explicit-rounding floatN -> shortN conversions.
 *   _rtz : forwards the argument as-is to convert_shortN / convert_shortN_sat
 *   _rte : rounds the argument with rint()  first, then forwards
 *   _rtp : rounds the argument with ceil()  first, then forwards
 *   _rtn : rounds the argument with floor() first, then forwards
 */
/* float2 -> short2 */
_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat_rtz(float2 x) { return convert_short2_sat(x); }
_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rte(float2 x) { return convert_short2(rint(x)); }
_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat_rte(float2 x) { return convert_short2_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtp(float2 x) { return convert_short2(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat_rtp(float2 x) { return convert_short2_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtn(float2 x) { return convert_short2(floor(x)); }
_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat_rtn(float2 x) { return convert_short2_sat(floor(x)); }

/* float3 -> short3 */
_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtz(float3 x) { return convert_short3(x); }
_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat_rtz(float3 x) { return convert_short3_sat(x); }
_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rte(float3 x) { return convert_short3(rint(x)); }
_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat_rte(float3 x) { return convert_short3_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtp(float3 x) { return convert_short3(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat_rtp(float3 x) { return convert_short3_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtn(float3 x) { return convert_short3(floor(x)); }
_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat_rtn(float3 x) { return convert_short3_sat(floor(x)); }

/* float4 -> short4 */
_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtz(float4 x) { return convert_short4(x); }
_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat_rtz(float4 x) { return convert_short4_sat(x); }
_CLC_DEF _CLC_OVERLOAD
short4 convert_short4_rte(float4 x)
{
    x = rint(x);
    return convert_short4(x);
}
/*
 * Explicit-rounding floatN -> shortN conversions (continued).
 *   _rtz : forwards the argument as-is to convert_shortN / convert_shortN_sat
 *   _rte : rounds the argument with rint()  first, then forwards
 *   _rtp : rounds the argument with ceil()  first, then forwards
 *   _rtn : rounds the argument with floor() first, then forwards
 */
/* float4 -> short4 */
_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat_rte(float4 x) { return convert_short4_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtp(float4 x) { return convert_short4(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat_rtp(float4 x) { return convert_short4_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtn(float4 x) { return convert_short4(floor(x)); }
_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat_rtn(float4 x) { return convert_short4_sat(floor(x)); }

/* float8 -> short8 */
_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtz(float8 x) { return convert_short8(x); }
_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat_rtz(float8 x) { return convert_short8_sat(x); }
_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rte(float8 x) { return convert_short8(rint(x)); }
_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat_rte(float8 x) { return convert_short8_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtp(float8 x) { return convert_short8(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat_rtp(float8 x) { return convert_short8_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtn(float8 x) { return convert_short8(floor(x)); }
_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat_rtn(float8 x) { return convert_short8_sat(floor(x)); }

/* float16 -> short16 */
_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtz(float16 x) { return convert_short16(x); }
_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rtz(float16 x) { return convert_short16_sat(x); }
_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rte(float16 x) { return convert_short16(rint(x)); }
_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rte(float16 x) { return convert_short16_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD
short16 convert_short16_rtp(float16 x)
{
    x = ceil(x);
    return
convert_short16(x);
}
/*
 * Explicit-rounding floatN -> short16 / ushortN conversions.
 *   _rtz : forwards the argument as-is to convert_T / convert_T_sat
 *   _rte : rounds the argument with rint()  first, then forwards
 *   _rtp : rounds the argument with ceil()  first, then forwards
 *   _rtn : rounds the argument with floor() first, then forwards
 */
/* float16 -> short16 */
_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rtp(float16 x) { return convert_short16_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtn(float16 x) { return convert_short16(floor(x)); }
_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rtn(float16 x) { return convert_short16_sat(floor(x)); }

/* float -> ushort */
_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtz(float x) { return convert_ushort(x); }
_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rtz(float x) { return convert_ushort_sat(x); }
_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rte(float x) { return convert_ushort(rint(x)); }
_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rte(float x) { return convert_ushort_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtp(float x) { return convert_ushort(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rtp(float x) { return convert_ushort_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtn(float x) { return convert_ushort(floor(x)); }
_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rtn(float x) { return convert_ushort_sat(floor(x)); }

/* float2 -> ushort2 */
_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtz(float2 x) { return convert_ushort2(x); }
_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rtz(float2 x) { return convert_ushort2_sat(x); }
_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rte(float2 x) { return convert_ushort2(rint(x)); }
_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rte(float2 x) { return convert_ushort2_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtp(float2 x) { return convert_ushort2(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rtp(float2 x) { return convert_ushort2_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD
ushort2 convert_ushort2_rtn(float2 x)
{
x = floor(x);
    return convert_ushort2(x);
}
/*
 * Explicit-rounding floatN -> ushortN conversions (continued).
 *   _rtz : forwards the argument as-is to convert_ushortN / convert_ushortN_sat
 *   _rte : rounds the argument with rint()  first, then forwards
 *   _rtp : rounds the argument with ceil()  first, then forwards
 *   _rtn : rounds the argument with floor() first, then forwards
 */
/* float2 -> ushort2 */
_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rtn(float2 x) { return convert_ushort2_sat(floor(x)); }

/* float3 -> ushort3 */
_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtz(float3 x) { return convert_ushort3(x); }
_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rtz(float3 x) { return convert_ushort3_sat(x); }
_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rte(float3 x) { return convert_ushort3(rint(x)); }
_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rte(float3 x) { return convert_ushort3_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtp(float3 x) { return convert_ushort3(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rtp(float3 x) { return convert_ushort3_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtn(float3 x) { return convert_ushort3(floor(x)); }
_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rtn(float3 x) { return convert_ushort3_sat(floor(x)); }

/* float4 -> ushort4 */
_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtz(float4 x) { return convert_ushort4(x); }
_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rtz(float4 x) { return convert_ushort4_sat(x); }
_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rte(float4 x) { return convert_ushort4(rint(x)); }
_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rte(float4 x) { return convert_ushort4_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtp(float4 x) { return convert_ushort4(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rtp(float4 x) { return convert_ushort4_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtn(float4 x) { return convert_ushort4(floor(x)); }
_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rtn(float4 x) { return convert_ushort4_sat(floor(x)); }
_CLC_DEF
_CLC_OVERLOAD
ushort8 convert_ushort8_rtz(float8 x)
{
    return convert_ushort8(x);
}
/*
 * Explicit-rounding floatN -> ushortN / int conversions.
 *   _rtz : forwards the argument as-is to convert_T / convert_T_sat
 *   _rte : rounds the argument with rint()  first, then forwards
 *   _rtp : rounds the argument with ceil()  first, then forwards
 *   _rtn : rounds the argument with floor() first, then forwards
 */
/* float8 -> ushort8 */
_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rtz(float8 x) { return convert_ushort8_sat(x); }
_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rte(float8 x) { return convert_ushort8(rint(x)); }
_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rte(float8 x) { return convert_ushort8_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtp(float8 x) { return convert_ushort8(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rtp(float8 x) { return convert_ushort8_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtn(float8 x) { return convert_ushort8(floor(x)); }
_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rtn(float8 x) { return convert_ushort8_sat(floor(x)); }

/* float16 -> ushort16 */
_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtz(float16 x) { return convert_ushort16(x); }
_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rtz(float16 x) { return convert_ushort16_sat(x); }
_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rte(float16 x) { return convert_ushort16(rint(x)); }
_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rte(float16 x) { return convert_ushort16_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtp(float16 x) { return convert_ushort16(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rtp(float16 x) { return convert_ushort16_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtn(float16 x) { return convert_ushort16(floor(x)); }
_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rtn(float16 x) { return convert_ushort16_sat(floor(x)); }

/* float -> int */
_CLC_DEF _CLC_OVERLOAD int convert_int_rtz(float x) { return convert_int(x); }
_CLC_DEF _CLC_OVERLOAD
int convert_int_sat_rtz(float x)
{
    return
convert_int_sat(x);
}
/*
 * Explicit-rounding floatN -> intN conversions.
 *   _rtz : forwards the argument as-is to convert_intN / convert_intN_sat
 *   _rte : rounds the argument with rint()  first, then forwards
 *   _rtp : rounds the argument with ceil()  first, then forwards
 *   _rtn : rounds the argument with floor() first, then forwards
 */
/* float -> int */
_CLC_DEF _CLC_OVERLOAD int convert_int_rte(float x) { return convert_int(rint(x)); }
_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rte(float x) { return convert_int_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD int convert_int_rtp(float x) { return convert_int(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rtp(float x) { return convert_int_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD int convert_int_rtn(float x) { return convert_int(floor(x)); }
_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rtn(float x) { return convert_int_sat(floor(x)); }

/* float2 -> int2 */
_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtz(float2 x) { return convert_int2(x); }
_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rtz(float2 x) { return convert_int2_sat(x); }
_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rte(float2 x) { return convert_int2(rint(x)); }
_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rte(float2 x) { return convert_int2_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtp(float2 x) { return convert_int2(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rtp(float2 x) { return convert_int2_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtn(float2 x) { return convert_int2(floor(x)); }
_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rtn(float2 x) { return convert_int2_sat(floor(x)); }

/* float3 -> int3 */
_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtz(float3 x) { return convert_int3(x); }
_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rtz(float3 x) { return convert_int3_sat(x); }
_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rte(float3 x) { return convert_int3(rint(x)); }
_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rte(float3 x) { return convert_int3_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtp(float3 x) { return convert_int3(ceil(x)); }
/*
 * Explicit-rounding floatN -> intN conversions (continued).
 *   _rtz : forwards the argument as-is to convert_intN / convert_intN_sat
 *   _rte : rounds the argument with rint()  first, then forwards
 *   _rtp : rounds the argument with ceil()  first, then forwards
 *   _rtn : rounds the argument with floor() first, then forwards
 */
/* float3 -> int3 */
_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rtp(float3 x) { return convert_int3_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtn(float3 x) { return convert_int3(floor(x)); }
_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rtn(float3 x) { return convert_int3_sat(floor(x)); }

/* float4 -> int4 */
_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtz(float4 x) { return convert_int4(x); }
_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rtz(float4 x) { return convert_int4_sat(x); }
_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rte(float4 x) { return convert_int4(rint(x)); }
_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rte(float4 x) { return convert_int4_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtp(float4 x) { return convert_int4(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rtp(float4 x) { return convert_int4_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtn(float4 x) { return convert_int4(floor(x)); }
_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rtn(float4 x) { return convert_int4_sat(floor(x)); }

/* float8 -> int8 */
_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtz(float8 x) { return convert_int8(x); }
_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rtz(float8 x) { return convert_int8_sat(x); }
_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rte(float8 x) { return convert_int8(rint(x)); }
_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rte(float8 x) { return convert_int8_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtp(float8 x) { return convert_int8(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rtp(float8 x) { return convert_int8_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtn(float8 x) { return convert_int8(floor(x)); }
_CLC_DEF _CLC_OVERLOAD
int8 convert_int8_sat_rtn(float8 x)
{
    x = floor(x);
    return convert_int8_sat(x);
}
/*
 * Explicit-rounding floatN -> int16 / uintN conversions.
 *   _rtz : forwards the argument as-is to convert_T / convert_T_sat
 *   _rte : rounds the argument with rint()  first, then forwards
 *   _rtp : rounds the argument with ceil()  first, then forwards
 *   _rtn : rounds the argument with floor() first, then forwards
 */
/* float16 -> int16 */
_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtz(float16 x) { return convert_int16(x); }
_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rtz(float16 x) { return convert_int16_sat(x); }
_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rte(float16 x) { return convert_int16(rint(x)); }
_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rte(float16 x) { return convert_int16_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtp(float16 x) { return convert_int16(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rtp(float16 x) { return convert_int16_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtn(float16 x) { return convert_int16(floor(x)); }
_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rtn(float16 x) { return convert_int16_sat(floor(x)); }

/* float -> uint */
_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtz(float x) { return convert_uint(x); }
_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rtz(float x) { return convert_uint_sat(x); }
_CLC_DEF _CLC_OVERLOAD uint convert_uint_rte(float x) { return convert_uint(rint(x)); }
_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rte(float x) { return convert_uint_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtp(float x) { return convert_uint(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rtp(float x) { return convert_uint_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtn(float x) { return convert_uint(floor(x)); }
_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rtn(float x) { return convert_uint_sat(floor(x)); }

/* float2 -> uint2 */
_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtz(float2 x) { return convert_uint2(x); }
_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rtz(float2 x) { return convert_uint2_sat(x); }
_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rte(float2 x) { return convert_uint2(rint(x)); }
/*
 * Explicit-rounding floatN -> uintN conversions.
 *   _rtz : forwards the argument as-is to convert_uintN / convert_uintN_sat
 *   _rte : rounds the argument with rint()  first, then forwards
 *   _rtp : rounds the argument with ceil()  first, then forwards
 *   _rtn : rounds the argument with floor() first, then forwards
 */
/* float2 -> uint2 */
_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rte(float2 x) { return convert_uint2_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtp(float2 x) { return convert_uint2(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rtp(float2 x) { return convert_uint2_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtn(float2 x) { return convert_uint2(floor(x)); }
_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rtn(float2 x) { return convert_uint2_sat(floor(x)); }

/* float3 -> uint3 */
_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtz(float3 x) { return convert_uint3(x); }
_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rtz(float3 x) { return convert_uint3_sat(x); }
_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rte(float3 x) { return convert_uint3(rint(x)); }
_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rte(float3 x) { return convert_uint3_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtp(float3 x) { return convert_uint3(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rtp(float3 x) { return convert_uint3_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtn(float3 x) { return convert_uint3(floor(x)); }
_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rtn(float3 x) { return convert_uint3_sat(floor(x)); }

/* float4 -> uint4 */
_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtz(float4 x) { return convert_uint4(x); }
_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rtz(float4 x) { return convert_uint4_sat(x); }
_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rte(float4 x) { return convert_uint4(rint(x)); }
_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rte(float4 x) { return convert_uint4_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtp(float4 x) { return convert_uint4(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD
uint4
convert_uint4_sat_rtp(float4 x)
{
    x = ceil(x);
    return convert_uint4_sat(x);
}
/*
 * Explicit-rounding floatN -> uintN conversions (continued).
 *   _rtz : forwards the argument as-is to convert_uintN / convert_uintN_sat
 *   _rte : rounds the argument with rint()  first, then forwards
 *   _rtp : rounds the argument with ceil()  first, then forwards
 *   _rtn : rounds the argument with floor() first, then forwards
 */
/* float4 -> uint4 */
_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtn(float4 x) { return convert_uint4(floor(x)); }
_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rtn(float4 x) { return convert_uint4_sat(floor(x)); }

/* float8 -> uint8 */
_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtz(float8 x) { return convert_uint8(x); }
_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rtz(float8 x) { return convert_uint8_sat(x); }
_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rte(float8 x) { return convert_uint8(rint(x)); }
_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rte(float8 x) { return convert_uint8_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtp(float8 x) { return convert_uint8(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rtp(float8 x) { return convert_uint8_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtn(float8 x) { return convert_uint8(floor(x)); }
_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rtn(float8 x) { return convert_uint8_sat(floor(x)); }

/* float16 -> uint16 */
_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtz(float16 x) { return convert_uint16(x); }
_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rtz(float16 x) { return convert_uint16_sat(x); }
_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rte(float16 x) { return convert_uint16(rint(x)); }
_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rte(float16 x) { return convert_uint16_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtp(float16 x) { return convert_uint16(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rtp(float16 x) { return convert_uint16_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtn(float16 x) { return convert_uint16(floor(x)); }
_CLC_DEF _CLC_OVERLOAD
uint16
convert_uint16_sat_rtn(float16 x)
{
    x = floor(x);
    return convert_uint16_sat(x);
}
/*
 * Explicit-rounding floatN -> longN conversions, compiled only when
 * cles_khr_int64 is defined.  The per-function guards are folded into one
 * region.
 *   _rtz : forwards the argument as-is to convert_longN / convert_longN_sat
 *   _rte : rounds the argument with rint()  first, then forwards
 *   _rtp : rounds the argument with ceil()  first, then forwards
 *   _rtn : rounds the argument with floor() first, then forwards
 */
#ifdef cles_khr_int64
/* float -> long */
_CLC_DEF _CLC_OVERLOAD long convert_long_rtz(float x) { return convert_long(x); }
_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rtz(float x) { return convert_long_sat(x); }
_CLC_DEF _CLC_OVERLOAD long convert_long_rte(float x) { return convert_long(rint(x)); }
_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rte(float x) { return convert_long_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD long convert_long_rtp(float x) { return convert_long(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rtp(float x) { return convert_long_sat(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD long convert_long_rtn(float x) { return convert_long(floor(x)); }
_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rtn(float x) { return convert_long_sat(floor(x)); }

/* float2 -> long2 */
_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtz(float2 x) { return convert_long2(x); }
_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rtz(float2 x) { return convert_long2_sat(x); }
_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rte(float2 x) { return convert_long2(rint(x)); }
_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rte(float2 x) { return convert_long2_sat(rint(x)); }
_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtp(float2 x) { return convert_long2(ceil(x)); }
_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rtp(float2 x) { return convert_long2_sat(ceil(x)); }
#endif
#ifdef
cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_rtn(float2 x) +{ + x = floor(x); + return convert_long2(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long2 convert_long2_sat_rtn(float2 x) +{ + x = floor(x); + return convert_long2_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rtz(float3 x) +{ + return convert_long3(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rtz(float3 x) +{ + return convert_long3_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rte(float3 x) +{ + x = rint(x); + return convert_long3(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rte(float3 x) +{ + x = rint(x); + return convert_long3_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rtp(float3 x) +{ + x = ceil(x); + return convert_long3(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rtp(float3 x) +{ + x = ceil(x); + return convert_long3_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_rtn(float3 x) +{ + x = floor(x); + return convert_long3(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long3 convert_long3_sat_rtn(float3 x) +{ + x = floor(x); + return convert_long3_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rtz(float4 x) +{ + return convert_long4(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rtz(float4 x) +{ + return convert_long4_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rte(float4 x) +{ + x = rint(x); + return convert_long4(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rte(float4 x) +{ + x = rint(x); + return convert_long4_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 
convert_long4_rtp(float4 x) +{ + x = ceil(x); + return convert_long4(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rtp(float4 x) +{ + x = ceil(x); + return convert_long4_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_rtn(float4 x) +{ + x = floor(x); + return convert_long4(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long4 convert_long4_sat_rtn(float4 x) +{ + x = floor(x); + return convert_long4_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rtz(float8 x) +{ + return convert_long8(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rtz(float8 x) +{ + return convert_long8_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rte(float8 x) +{ + x = rint(x); + return convert_long8(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rte(float8 x) +{ + x = rint(x); + return convert_long8_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rtp(float8 x) +{ + x = ceil(x); + return convert_long8(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rtp(float8 x) +{ + x = ceil(x); + return convert_long8_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_rtn(float8 x) +{ + x = floor(x); + return convert_long8(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long8 convert_long8_sat_rtn(float8 x) +{ + x = floor(x); + return convert_long8_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rtz(float16 x) +{ + return convert_long16(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rtz(float16 x) +{ + return convert_long16_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rte(float16 x) +{ + x = rint(x); + return 
convert_long16(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rte(float16 x) +{ + x = rint(x); + return convert_long16_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rtp(float16 x) +{ + x = ceil(x); + return convert_long16(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rtp(float16 x) +{ + x = ceil(x); + return convert_long16_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_rtn(float16 x) +{ + x = floor(x); + return convert_long16(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +long16 convert_long16_sat_rtn(float16 x) +{ + x = floor(x); + return convert_long16_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rtz(float x) +{ + return convert_ulong(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rtz(float x) +{ + return convert_ulong_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rte(float x) +{ + x = rint(x); + return convert_ulong(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rte(float x) +{ + x = rint(x); + return convert_ulong_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rtp(float x) +{ + x = ceil(x); + return convert_ulong(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rtp(float x) +{ + x = ceil(x); + return convert_ulong_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_rtn(float x) +{ + x = floor(x); + return convert_ulong(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong convert_ulong_sat_rtn(float x) +{ + x = floor(x); + return convert_ulong_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rtz(float2 x) +{ + return convert_ulong2(x); +} +#endif +#ifdef 
cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rtz(float2 x) +{ + return convert_ulong2_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rte(float2 x) +{ + x = rint(x); + return convert_ulong2(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rte(float2 x) +{ + x = rint(x); + return convert_ulong2_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rtp(float2 x) +{ + x = ceil(x); + return convert_ulong2(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rtp(float2 x) +{ + x = ceil(x); + return convert_ulong2_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_rtn(float2 x) +{ + x = floor(x); + return convert_ulong2(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong2 convert_ulong2_sat_rtn(float2 x) +{ + x = floor(x); + return convert_ulong2_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rtz(float3 x) +{ + return convert_ulong3(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rtz(float3 x) +{ + return convert_ulong3_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rte(float3 x) +{ + x = rint(x); + return convert_ulong3(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rte(float3 x) +{ + x = rint(x); + return convert_ulong3_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rtp(float3 x) +{ + x = ceil(x); + return convert_ulong3(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rtp(float3 x) +{ + x = ceil(x); + return convert_ulong3_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_rtn(float3 x) +{ + x = floor(x); + return convert_ulong3(x); +} +#endif +#ifdef 
cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong3 convert_ulong3_sat_rtn(float3 x) +{ + x = floor(x); + return convert_ulong3_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rtz(float4 x) +{ + return convert_ulong4(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rtz(float4 x) +{ + return convert_ulong4_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rte(float4 x) +{ + x = rint(x); + return convert_ulong4(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rte(float4 x) +{ + x = rint(x); + return convert_ulong4_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rtp(float4 x) +{ + x = ceil(x); + return convert_ulong4(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rtp(float4 x) +{ + x = ceil(x); + return convert_ulong4_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_rtn(float4 x) +{ + x = floor(x); + return convert_ulong4(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong4 convert_ulong4_sat_rtn(float4 x) +{ + x = floor(x); + return convert_ulong4_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rtz(float8 x) +{ + return convert_ulong8(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rtz(float8 x) +{ + return convert_ulong8_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rte(float8 x) +{ + x = rint(x); + return convert_ulong8(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rte(float8 x) +{ + x = rint(x); + return convert_ulong8_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rtp(float8 x) +{ + x = ceil(x); + return convert_ulong8(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF 
_CLC_OVERLOAD +ulong8 convert_ulong8_sat_rtp(float8 x) +{ + x = ceil(x); + return convert_ulong8_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_rtn(float8 x) +{ + x = floor(x); + return convert_ulong8(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong8 convert_ulong8_sat_rtn(float8 x) +{ + x = floor(x); + return convert_ulong8_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rtz(float16 x) +{ + return convert_ulong16(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rtz(float16 x) +{ + return convert_ulong16_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rte(float16 x) +{ + x = rint(x); + return convert_ulong16(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rte(float16 x) +{ + x = rint(x); + return convert_ulong16_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rtp(float16 x) +{ + x = ceil(x); + return convert_ulong16(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rtp(float16 x) +{ + x = ceil(x); + return convert_ulong16_sat(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_rtn(float16 x) +{ + x = floor(x); + return convert_ulong16(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +ulong16 convert_ulong16_sat_rtn(float16 x) +{ + x = floor(x); + return convert_ulong16_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char convert_char_rtz(double x) +{ + return convert_char(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rtz(double x) +{ + return convert_char_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char convert_char_rte(double x) +{ + x = rint(x); + return convert_char(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char 
convert_char_sat_rte(double x) +{ + x = rint(x); + return convert_char_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char convert_char_rtp(double x) +{ + x = ceil(x); + return convert_char(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rtp(double x) +{ + x = ceil(x); + return convert_char_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char convert_char_rtn(double x) +{ + x = floor(x); + return convert_char(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char convert_char_sat_rtn(double x) +{ + x = floor(x); + return convert_char_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rtz(double2 x) +{ + return convert_char2(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rtz(double2 x) +{ + return convert_char2_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rte(double2 x) +{ + x = rint(x); + return convert_char2(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rte(double2 x) +{ + x = rint(x); + return convert_char2_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rtp(double2 x) +{ + x = ceil(x); + return convert_char2(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rtp(double2 x) +{ + x = ceil(x); + return convert_char2_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_rtn(double2 x) +{ + x = floor(x); + return convert_char2(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char2 convert_char2_sat_rtn(double2 x) +{ + x = floor(x); + return convert_char2_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rtz(double3 x) +{ + return convert_char3(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rtz(double3 x) +{ + return convert_char3_sat(x); +} +#endif +#ifdef cl_khr_fp64 
+_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rte(double3 x) +{ + x = rint(x); + return convert_char3(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rte(double3 x) +{ + x = rint(x); + return convert_char3_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rtp(double3 x) +{ + x = ceil(x); + return convert_char3(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rtp(double3 x) +{ + x = ceil(x); + return convert_char3_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_rtn(double3 x) +{ + x = floor(x); + return convert_char3(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char3 convert_char3_sat_rtn(double3 x) +{ + x = floor(x); + return convert_char3_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rtz(double4 x) +{ + return convert_char4(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rtz(double4 x) +{ + return convert_char4_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rte(double4 x) +{ + x = rint(x); + return convert_char4(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rte(double4 x) +{ + x = rint(x); + return convert_char4_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rtp(double4 x) +{ + x = ceil(x); + return convert_char4(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rtp(double4 x) +{ + x = ceil(x); + return convert_char4_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_rtn(double4 x) +{ + x = floor(x); + return convert_char4(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char4 convert_char4_sat_rtn(double4 x) +{ + x = floor(x); + return convert_char4_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rtz(double8 x) +{ + 
return convert_char8(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rtz(double8 x) +{ + return convert_char8_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rte(double8 x) +{ + x = rint(x); + return convert_char8(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rte(double8 x) +{ + x = rint(x); + return convert_char8_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rtp(double8 x) +{ + x = ceil(x); + return convert_char8(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rtp(double8 x) +{ + x = ceil(x); + return convert_char8_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_rtn(double8 x) +{ + x = floor(x); + return convert_char8(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char8 convert_char8_sat_rtn(double8 x) +{ + x = floor(x); + return convert_char8_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rtz(double16 x) +{ + return convert_char16(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rtz(double16 x) +{ + return convert_char16_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rte(double16 x) +{ + x = rint(x); + return convert_char16(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rte(double16 x) +{ + x = rint(x); + return convert_char16_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rtp(double16 x) +{ + x = ceil(x); + return convert_char16(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rtp(double16 x) +{ + x = ceil(x); + return convert_char16_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_rtn(double16 x) +{ + x = floor(x); + return convert_char16(x); +} +#endif +#ifdef 
cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +char16 convert_char16_sat_rtn(double16 x) +{ + x = floor(x); + return convert_char16_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rtz(double x) +{ + return convert_uchar(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rtz(double x) +{ + return convert_uchar_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rte(double x) +{ + x = rint(x); + return convert_uchar(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rte(double x) +{ + x = rint(x); + return convert_uchar_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rtp(double x) +{ + x = ceil(x); + return convert_uchar(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rtp(double x) +{ + x = ceil(x); + return convert_uchar_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_rtn(double x) +{ + x = floor(x); + return convert_uchar(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar convert_uchar_sat_rtn(double x) +{ + x = floor(x); + return convert_uchar_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rtz(double2 x) +{ + return convert_uchar2(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rtz(double2 x) +{ + return convert_uchar2_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rte(double2 x) +{ + x = rint(x); + return convert_uchar2(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rte(double2 x) +{ + x = rint(x); + return convert_uchar2_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rtp(double2 x) +{ + x = ceil(x); + return convert_uchar2(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rtp(double2 x) +{ + x 
= ceil(x); + return convert_uchar2_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_rtn(double2 x) +{ + x = floor(x); + return convert_uchar2(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar2 convert_uchar2_sat_rtn(double2 x) +{ + x = floor(x); + return convert_uchar2_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rtz(double3 x) +{ + return convert_uchar3(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rtz(double3 x) +{ + return convert_uchar3_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rte(double3 x) +{ + x = rint(x); + return convert_uchar3(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rte(double3 x) +{ + x = rint(x); + return convert_uchar3_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rtp(double3 x) +{ + x = ceil(x); + return convert_uchar3(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rtp(double3 x) +{ + x = ceil(x); + return convert_uchar3_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_rtn(double3 x) +{ + x = floor(x); + return convert_uchar3(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar3 convert_uchar3_sat_rtn(double3 x) +{ + x = floor(x); + return convert_uchar3_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rtz(double4 x) +{ + return convert_uchar4(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rtz(double4 x) +{ + return convert_uchar4_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rte(double4 x) +{ + x = rint(x); + return convert_uchar4(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rte(double4 x) +{ + x = rint(x); + return convert_uchar4_sat(x); +} +#endif 
+#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rtp(double4 x) +{ + x = ceil(x); + return convert_uchar4(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rtp(double4 x) +{ + x = ceil(x); + return convert_uchar4_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_rtn(double4 x) +{ + x = floor(x); + return convert_uchar4(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar4 convert_uchar4_sat_rtn(double4 x) +{ + x = floor(x); + return convert_uchar4_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rtz(double8 x) +{ + return convert_uchar8(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rtz(double8 x) +{ + return convert_uchar8_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rte(double8 x) +{ + x = rint(x); + return convert_uchar8(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rte(double8 x) +{ + x = rint(x); + return convert_uchar8_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rtp(double8 x) +{ + x = ceil(x); + return convert_uchar8(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rtp(double8 x) +{ + x = ceil(x); + return convert_uchar8_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_rtn(double8 x) +{ + x = floor(x); + return convert_uchar8(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar8 convert_uchar8_sat_rtn(double8 x) +{ + x = floor(x); + return convert_uchar8_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rtz(double16 x) +{ + return convert_uchar16(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rtz(double16 x) +{ + return convert_uchar16_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD 
+uchar16 convert_uchar16_rte(double16 x) +{ + x = rint(x); + return convert_uchar16(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rte(double16 x) +{ + x = rint(x); + return convert_uchar16_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rtp(double16 x) +{ + x = ceil(x); + return convert_uchar16(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rtp(double16 x) +{ + x = ceil(x); + return convert_uchar16_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_rtn(double16 x) +{ + x = floor(x); + return convert_uchar16(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +uchar16 convert_uchar16_sat_rtn(double16 x) +{ + x = floor(x); + return convert_uchar16_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short convert_short_rtz(double x) +{ + return convert_short(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rtz(double x) +{ + return convert_short_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short convert_short_rte(double x) +{ + x = rint(x); + return convert_short(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rte(double x) +{ + x = rint(x); + return convert_short_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short convert_short_rtp(double x) +{ + x = ceil(x); + return convert_short(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rtp(double x) +{ + x = ceil(x); + return convert_short_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short convert_short_rtn(double x) +{ + x = floor(x); + return convert_short(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short convert_short_sat_rtn(double x) +{ + x = floor(x); + return convert_short_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short2 
convert_short2_rtz(double2 x) +{ + return convert_short2(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rtz(double2 x) +{ + return convert_short2_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rte(double2 x) +{ + x = rint(x); + return convert_short2(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rte(double2 x) +{ + x = rint(x); + return convert_short2_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rtp(double2 x) +{ + x = ceil(x); + return convert_short2(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rtp(double2 x) +{ + x = ceil(x); + return convert_short2_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_rtn(double2 x) +{ + x = floor(x); + return convert_short2(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short2 convert_short2_sat_rtn(double2 x) +{ + x = floor(x); + return convert_short2_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rtz(double3 x) +{ + return convert_short3(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rtz(double3 x) +{ + return convert_short3_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rte(double3 x) +{ + x = rint(x); + return convert_short3(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rte(double3 x) +{ + x = rint(x); + return convert_short3_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rtp(double3 x) +{ + x = ceil(x); + return convert_short3(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rtp(double3 x) +{ + x = ceil(x); + return convert_short3_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_rtn(double3 x) +{ + x = floor(x); + 
return convert_short3(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short3 convert_short3_sat_rtn(double3 x) +{ + x = floor(x); + return convert_short3_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rtz(double4 x) +{ + return convert_short4(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rtz(double4 x) +{ + return convert_short4_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rte(double4 x) +{ + x = rint(x); + return convert_short4(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rte(double4 x) +{ + x = rint(x); + return convert_short4_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rtp(double4 x) +{ + x = ceil(x); + return convert_short4(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rtp(double4 x) +{ + x = ceil(x); + return convert_short4_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_rtn(double4 x) +{ + x = floor(x); + return convert_short4(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short4 convert_short4_sat_rtn(double4 x) +{ + x = floor(x); + return convert_short4_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rtz(double8 x) +{ + return convert_short8(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rtz(double8 x) +{ + return convert_short8_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rte(double8 x) +{ + x = rint(x); + return convert_short8(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_sat_rte(double8 x) +{ + x = rint(x); + return convert_short8_sat(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +short8 convert_short8_rtp(double8 x) +{ + x = ceil(x); + return convert_short8(x); +} +#endif +#ifdef 
cl_khr_fp64
/* Round toward +infinity, then saturate into short8. */
_CLC_DEF _CLC_OVERLOAD
short8 convert_short8_sat_rtp(double8 x)
{
  return convert_short8_sat(ceil(x));
}
#endif

/*
 * Emits the eight explicit-rounding-mode wrappers that convert a double
 * (scalar or vector of width N) to the integer type DST##N:
 *
 *   _rtz / _sat_rtz : forward directly to convert_DST / convert_DST_sat
 *   _rte / _sat_rte : apply rint()  to the input, then convert
 *   _rtp / _sat_rtp : apply ceil()  to the input, then convert
 *   _rtn / _sat_rtn : apply floor() to the input, then convert
 *
 * DST and N are only ever used as token-paste operands, so pass N empty for
 * the scalar variants. Preprocessor guards cannot live inside a macro body;
 * the caller wraps the instantiations in the required #if/#ifdef.
 */
#define _CLC_DOUBLE_TO_INT_ALL_MODES(DST, N) \
  _CLC_DEF _CLC_OVERLOAD \
  DST##N convert_##DST##N##_rtz(double##N x) \
  { return convert_##DST##N(x); } \
  _CLC_DEF _CLC_OVERLOAD \
  DST##N convert_##DST##N##_sat_rtz(double##N x) \
  { return convert_##DST##N##_sat(x); } \
  _CLC_DEF _CLC_OVERLOAD \
  DST##N convert_##DST##N##_rte(double##N x) \
  { return convert_##DST##N(rint(x)); } \
  _CLC_DEF _CLC_OVERLOAD \
  DST##N convert_##DST##N##_sat_rte(double##N x) \
  { return convert_##DST##N##_sat(rint(x)); } \
  _CLC_DEF _CLC_OVERLOAD \
  DST##N convert_##DST##N##_rtp(double##N x) \
  { return convert_##DST##N(ceil(x)); } \
  _CLC_DEF _CLC_OVERLOAD \
  DST##N convert_##DST##N##_sat_rtp(double##N x) \
  { return convert_##DST##N##_sat(ceil(x)); } \
  _CLC_DEF _CLC_OVERLOAD \
  DST##N convert_##DST##N##_rtn(double##N x) \
  { return convert_##DST##N(floor(x)); } \
  _CLC_DEF _CLC_OVERLOAD \
  DST##N convert_##DST##N##_sat_rtn(double##N x) \
  { return convert_##DST##N##_sat(floor(x)); }

#ifdef cl_khr_fp64

/* Remaining short8 wrappers (the earlier modes precede this section). */
_CLC_DEF _CLC_OVERLOAD
short8 convert_short8_rtn(double8 x)
{
  return convert_short8(floor(x));
}
_CLC_DEF _CLC_OVERLOAD
short8 convert_short8_sat_rtn(double8 x)
{
  return convert_short8_sat(floor(x));
}

/* double -> short16 */
_CLC_DOUBLE_TO_INT_ALL_MODES(short, 16)

/* double -> ushort, ushort2 .. ushort16 */
_CLC_DOUBLE_TO_INT_ALL_MODES(ushort, )
_CLC_DOUBLE_TO_INT_ALL_MODES(ushort, 2)
_CLC_DOUBLE_TO_INT_ALL_MODES(ushort, 3)
_CLC_DOUBLE_TO_INT_ALL_MODES(ushort, 4)
_CLC_DOUBLE_TO_INT_ALL_MODES(ushort, 8)
_CLC_DOUBLE_TO_INT_ALL_MODES(ushort, 16)

/* double -> int, int2 .. int16 */
_CLC_DOUBLE_TO_INT_ALL_MODES(int, )
_CLC_DOUBLE_TO_INT_ALL_MODES(int, 2)
_CLC_DOUBLE_TO_INT_ALL_MODES(int, 3)
_CLC_DOUBLE_TO_INT_ALL_MODES(int, 4)
_CLC_DOUBLE_TO_INT_ALL_MODES(int, 8)
_CLC_DOUBLE_TO_INT_ALL_MODES(int, 16)

/* double -> uint, uint2 .. uint16 */
_CLC_DOUBLE_TO_INT_ALL_MODES(uint, )
_CLC_DOUBLE_TO_INT_ALL_MODES(uint, 2)
_CLC_DOUBLE_TO_INT_ALL_MODES(uint, 3)
_CLC_DOUBLE_TO_INT_ALL_MODES(uint, 4)
_CLC_DOUBLE_TO_INT_ALL_MODES(uint, 8)
_CLC_DOUBLE_TO_INT_ALL_MODES(uint, 16)

#endif

/* The 64-bit integer targets are additionally gated on cles_khr_int64. */
#if defined(cl_khr_fp64) && defined(cles_khr_int64)

/* double -> long, long2, long3, long4 */
_CLC_DOUBLE_TO_INT_ALL_MODES(long, )
_CLC_DOUBLE_TO_INT_ALL_MODES(long, 2)
_CLC_DOUBLE_TO_INT_ALL_MODES(long, 3)
_CLC_DOUBLE_TO_INT_ALL_MODES(long, 4)

/* double -> long8: every mode except _sat_rtn, which follows this section. */
_CLC_DEF _CLC_OVERLOAD
long8 convert_long8_rtz(double8 x)
{
  return convert_long8(x);
}
_CLC_DEF _CLC_OVERLOAD
long8 convert_long8_sat_rtz(double8 x)
{
  return convert_long8_sat(x);
}
_CLC_DEF _CLC_OVERLOAD
long8 convert_long8_rte(double8 x)
{
  return convert_long8(rint(x));
}
_CLC_DEF _CLC_OVERLOAD
long8 convert_long8_sat_rte(double8 x)
{
  return convert_long8_sat(rint(x));
}
_CLC_DEF _CLC_OVERLOAD
long8 convert_long8_rtp(double8 x)
{
  return convert_long8(ceil(x));
}
_CLC_DEF _CLC_OVERLOAD
long8 convert_long8_sat_rtp(double8 x)
{
  return convert_long8_sat(ceil(x));
}
_CLC_DEF _CLC_OVERLOAD
long8 convert_long8_rtn(double8 x)
{
  return convert_long8(floor(x));
}

#endif

#undef _CLC_DOUBLE_TO_INT_ALL_MODES
/*
 * Emits the eight explicit-rounding-mode wrappers that convert a double
 * (scalar or vector of width N) to the integer type DST##N:
 *
 *   _rtz / _sat_rtz : forward directly to convert_DST / convert_DST_sat
 *   _rte / _sat_rte : apply rint()  to the input, then convert
 *   _rtp / _sat_rtp : apply ceil()  to the input, then convert
 *   _rtn / _sat_rtn : apply floor() to the input, then convert
 *
 * DST and N are only ever used as token-paste operands, so pass N empty for
 * the scalar variants. Preprocessor guards cannot live inside a macro body;
 * the caller wraps the instantiations in the required #if.
 */
#define _CLC_DOUBLE_TO_INT_ALL_MODES(DST, N) \
  _CLC_DEF _CLC_OVERLOAD \
  DST##N convert_##DST##N##_rtz(double##N x) \
  { return convert_##DST##N(x); } \
  _CLC_DEF _CLC_OVERLOAD \
  DST##N convert_##DST##N##_sat_rtz(double##N x) \
  { return convert_##DST##N##_sat(x); } \
  _CLC_DEF _CLC_OVERLOAD \
  DST##N convert_##DST##N##_rte(double##N x) \
  { return convert_##DST##N(rint(x)); } \
  _CLC_DEF _CLC_OVERLOAD \
  DST##N convert_##DST##N##_sat_rte(double##N x) \
  { return convert_##DST##N##_sat(rint(x)); } \
  _CLC_DEF _CLC_OVERLOAD \
  DST##N convert_##DST##N##_rtp(double##N x) \
  { return convert_##DST##N(ceil(x)); } \
  _CLC_DEF _CLC_OVERLOAD \
  DST##N convert_##DST##N##_sat_rtp(double##N x) \
  { return convert_##DST##N##_sat(ceil(x)); } \
  _CLC_DEF _CLC_OVERLOAD \
  DST##N convert_##DST##N##_rtn(double##N x) \
  { return convert_##DST##N(floor(x)); } \
  _CLC_DEF _CLC_OVERLOAD \
  DST##N convert_##DST##N##_sat_rtn(double##N x) \
  { return convert_##DST##N##_sat(floor(x)); }

/* All 64-bit integer targets need both fp64 and int64 support. */
#if defined(cl_khr_fp64) && defined(cles_khr_int64)

/* Last long8 wrapper (the other modes precede this section). */
_CLC_DEF _CLC_OVERLOAD
long8 convert_long8_sat_rtn(double8 x)
{
  return convert_long8_sat(floor(x));
}

/* double -> long16 */
_CLC_DOUBLE_TO_INT_ALL_MODES(long, 16)

/* double -> ulong, ulong2, ulong3, ulong4 */
_CLC_DOUBLE_TO_INT_ALL_MODES(ulong, )
_CLC_DOUBLE_TO_INT_ALL_MODES(ulong, 2)
_CLC_DOUBLE_TO_INT_ALL_MODES(ulong, 3)
_CLC_DOUBLE_TO_INT_ALL_MODES(ulong, 4)

/* double -> ulong8: modes up to _rtp; the rest follow this section. */
_CLC_DEF _CLC_OVERLOAD
ulong8 convert_ulong8_rtz(double8 x)
{
  return convert_ulong8(x);
}
_CLC_DEF _CLC_OVERLOAD
ulong8 convert_ulong8_sat_rtz(double8 x)
{
  return convert_ulong8_sat(x);
}
_CLC_DEF _CLC_OVERLOAD
ulong8 convert_ulong8_rte(double8 x)
{
  return convert_ulong8(rint(x));
}
_CLC_DEF _CLC_OVERLOAD
ulong8 convert_ulong8_sat_rte(double8 x)
{
  return convert_ulong8_sat(rint(x));
}
_CLC_DEF _CLC_OVERLOAD
ulong8 convert_ulong8_rtp(double8 x)
{
  return convert_ulong8(ceil(x));
}

#endif

#undef _CLC_DOUBLE_TO_INT_ALL_MODES

#if defined(cl_khr_fp64) && defined(cles_khr_int64)
_CLC_DEF _CLC_OVERLOAD
ulong8
convert_ulong8_sat_rtp(double8 x)
{
  x = ceil(x);
  return convert_ulong8_sat(x);
}
#endif

/*
 * Remaining double -> ulong8 / ulong16 wrappers. Each one pre-rounds the
 * input with rint()/ceil()/floor() as its suffix requires (the _rtz forms
 * forward unchanged) and then calls the plain or saturating conversion.
 */
#if defined(cl_khr_fp64) && defined(cles_khr_int64)
_CLC_DEF _CLC_OVERLOAD
ulong8 convert_ulong8_rtn(double8 x)
{
  x = floor(x);
  return convert_ulong8(x);
}
_CLC_DEF _CLC_OVERLOAD
ulong8 convert_ulong8_sat_rtn(double8 x)
{
  x = floor(x);
  return convert_ulong8_sat(x);
}
_CLC_DEF _CLC_OVERLOAD
ulong16 convert_ulong16_rtz(double16 x)
{
  return convert_ulong16(x);
}
_CLC_DEF _CLC_OVERLOAD
ulong16 convert_ulong16_sat_rtz(double16 x)
{
  return convert_ulong16_sat(x);
}
_CLC_DEF _CLC_OVERLOAD
ulong16 convert_ulong16_rte(double16 x)
{
  x = rint(x);
  return convert_ulong16(x);
}
_CLC_DEF _CLC_OVERLOAD
ulong16 convert_ulong16_sat_rte(double16 x)
{
  x = rint(x);
  return convert_ulong16_sat(x);
}
_CLC_DEF _CLC_OVERLOAD
ulong16 convert_ulong16_rtp(double16 x)
{
  x = ceil(x);
  return convert_ulong16(x);
}
_CLC_DEF _CLC_OVERLOAD
ulong16 convert_ulong16_sat_rtp(double16 x)
{
  x = ceil(x);
  return convert_ulong16_sat(x);
}
_CLC_DEF _CLC_OVERLOAD
ulong16 convert_ulong16_rtn(double16 x)
{
  x = floor(x);
  return convert_ulong16(x);
}
_CLC_DEF _CLC_OVERLOAD
ulong16 convert_ulong16_sat_rtn(double16 x)
{
  x = floor(x);
  return convert_ulong16_sat(x);
}
#endif

/*
 * char -> float conversions with an explicit rounding mode.
 *
 * Scheme shared by the _rtz/_rtp/_rtn variants below:
 *   r = default conversion of x to float
 *   y = r converted back to the source type
 * Comparing y with x shows in which direction the default conversion moved
 * the value. When that direction is wrong for the requested mode, r is
 * stepped to the neighbouring float with nextafter(); otherwise r is
 * returned unchanged. The _rte variants forward to the default conversion.
 *
 * Fix: y was previously initialised from itself (convert_charN(y)), i.e.
 * from an indeterminate value, so the select() condition was garbage. The
 * round trip has to start from r.
 *
 * NOTE(review): for 8-bit char inputs the float round trip is presumably
 * always exact, so the nextafter() branch should never be selected; the
 * structure is kept so it matches the wrappers for wider source types.
 */
_CLC_DEF _CLC_OVERLOAD
float convert_float_rtz(char x)
{
  float r = convert_float(x);
  char y = convert_char(r);
  uchar abs_x = abs(x);
  uchar abs_y = abs(y);
  /* Magnitude grew: step r one float toward zero. */
  return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
}
_CLC_DEF _CLC_OVERLOAD
float convert_float_rte(char x)
{
  return convert_float(x);
}
_CLC_DEF _CLC_OVERLOAD
float convert_float_rtp(char x)
{
  float r = convert_float(x);
  char y = convert_char(r);
  /* Result fell below x: step r toward +infinity. */
  return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
}
_CLC_DEF _CLC_OVERLOAD
float convert_float_rtn(char x)
{
  float r = convert_float(x);
  char y = convert_char(r);
  /* Result rose above x: step r toward -infinity. */
  return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
}

/* char2 -> float2 */
_CLC_DEF _CLC_OVERLOAD
float2 convert_float2_rtz(char2 x)
{
  float2 r = convert_float2(x);
  char2 y = convert_char2(r);
  uchar2 abs_x = abs(x);
  uchar2 abs_y = abs(y);
  return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
}
_CLC_DEF _CLC_OVERLOAD
float2 convert_float2_rte(char2 x)
{
  return convert_float2(x);
}
_CLC_DEF _CLC_OVERLOAD
float2 convert_float2_rtp(char2 x)
{
  float2 r = convert_float2(x);
  char2 y = convert_char2(r);
  return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
}
_CLC_DEF _CLC_OVERLOAD
float2 convert_float2_rtn(char2 x)
{
  float2 r = convert_float2(x);
  char2 y = convert_char2(r);
  return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
}

/* char3 -> float3 */
_CLC_DEF _CLC_OVERLOAD
float3 convert_float3_rtz(char3 x)
{
  float3 r = convert_float3(x);
  char3 y = convert_char3(r);
  uchar3 abs_x = abs(x);
  uchar3 abs_y = abs(y);
  return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
}
_CLC_DEF _CLC_OVERLOAD
float3 convert_float3_rte(char3 x)
{
  return convert_float3(x);
}
_CLC_DEF _CLC_OVERLOAD
float3 convert_float3_rtp(char3 x)
{
  float3 r = convert_float3(x);
  char3 y = convert_char3(r);
  return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
}
_CLC_DEF _CLC_OVERLOAD
float3 convert_float3_rtn(char3 x)
{
  float3 r = convert_float3(x);
  char3 y = convert_char3(r);
  return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
}

/* char4 -> float4 */
_CLC_DEF _CLC_OVERLOAD
float4 convert_float4_rtz(char4 x)
{
  float4 r = convert_float4(x);
  char4 y = convert_char4(r);
  uchar4 abs_x = abs(x);
  uchar4 abs_y = abs(y);
  return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
}
_CLC_DEF _CLC_OVERLOAD
float4 convert_float4_rte(char4 x)
{
  return convert_float4(x);
}
_CLC_DEF _CLC_OVERLOAD
float4 convert_float4_rtp(char4 x)
{
  float4 r = convert_float4(x);
  char4 y = convert_char4(r);
  return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
}
_CLC_DEF _CLC_OVERLOAD
float4 convert_float4_rtn(char4 x)
{
  float4 r = convert_float4(x);
  char4 y = convert_char4(r);
  return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
}

/* char8 -> float8 */
_CLC_DEF _CLC_OVERLOAD
float8 convert_float8_rtz(char8 x)
{
  float8 r = convert_float8(x);
  char8 y = convert_char8(r);
  uchar8 abs_x = abs(x);
  uchar8 abs_y = abs(y);
  return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
}
_CLC_DEF _CLC_OVERLOAD
float8 convert_float8_rte(char8 x)
{
  return convert_float8(x);
}
_CLC_DEF _CLC_OVERLOAD
float8 convert_float8_rtp(char8 x)
{
  float8 r = convert_float8(x);
  char8 y = convert_char8(r);
  return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
}
_CLC_DEF _CLC_OVERLOAD
float8 convert_float8_rtn(char8 x)
{
  float8 r = convert_float8(x);
  char8 y = convert_char8(r);
  return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
}

/* char16 -> float16 (the remaining modes follow this section) */
_CLC_DEF _CLC_OVERLOAD
float16 convert_float16_rtz(char16 x)
{
  float16 r = convert_float16(x);
  char16 y = convert_char16(r);
  uchar16 abs_x = abs(x);
  uchar16 abs_y = abs(y);
  return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
}
_CLC_DEF _CLC_OVERLOAD
float16
convert_float16_rte(char16 x) +{ + return convert_float16(x); +} +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16_rtp(char16 x) +{ + float16 r = convert_float16(x); + char16 y = convert_char16(y); + return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x)); +} +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16_rtn(char16 x) +{ + float16 r = convert_float16(x); + char16 y = convert_char16(y); + return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x)); +} +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double_rtz(char x) +{ + double r = convert_double(x); + char y = convert_char(y); + uchar abs_x = abs(x); + uchar abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double_rte(char x) +{ + return convert_double(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double_rtp(char x) +{ + double r = convert_double(x); + char y = convert_char(y); + return select(r, nextafter(r, (double)INFINITY), convert_long(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double_rtn(char x) +{ + double r = convert_double(x); + char y = convert_char(y); + return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2_rtz(char2 x) +{ + double2 r = convert_double2(x); + char2 y = convert_char2(y); + uchar2 abs_x = abs(x); + uchar2 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2_rte(char2 x) +{ + return convert_double2(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2_rtp(char2 x) +{ + double2 r = convert_double2(x); + char2 y = convert_char2(y); + return select(r, nextafter(r, (double2)INFINITY), convert_long2(y 
< x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2_rtn(char2 x) +{ + double2 r = convert_double2(x); + char2 y = convert_char2(y); + return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3_rtz(char3 x) +{ + double3 r = convert_double3(x); + char3 y = convert_char3(y); + uchar3 abs_x = abs(x); + uchar3 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3_rte(char3 x) +{ + return convert_double3(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3_rtp(char3 x) +{ + double3 r = convert_double3(x); + char3 y = convert_char3(y); + return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3_rtn(char3 x) +{ + double3 r = convert_double3(x); + char3 y = convert_char3(y); + return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4_rtz(char4 x) +{ + double4 r = convert_double4(x); + char4 y = convert_char4(y); + uchar4 abs_x = abs(x); + uchar4 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4_rte(char4 x) +{ + return convert_double4(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4_rtp(char4 x) +{ + double4 r = convert_double4(x); + char4 y = convert_char4(y); + return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4_rtn(char4 x) +{ + double4 r = convert_double4(x); + char4 y = convert_char4(y); + return select(r, nextafter(r, 
(double4)-INFINITY), convert_long4(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rtz(char8 x) +{ + double8 r = convert_double8(x); + char8 y = convert_char8(y); + uchar8 abs_x = abs(x); + uchar8 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rte(char8 x) +{ + return convert_double8(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rtp(char8 x) +{ + double8 r = convert_double8(x); + char8 y = convert_char8(y); + return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rtn(char8 x) +{ + double8 r = convert_double8(x); + char8 y = convert_char8(y); + return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rtz(char16 x) +{ + double16 r = convert_double16(x); + char16 y = convert_char16(y); + uchar16 abs_x = abs(x); + uchar16 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rte(char16 x) +{ + return convert_double16(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rtp(char16 x) +{ + double16 r = convert_double16(x); + char16 y = convert_char16(y); + return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rtn(char16 x) +{ + double16 r = convert_double16(x); + char16 y = convert_char16(y); + return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x)); +} +#endif +_CLC_DEF _CLC_OVERLOAD +float convert_float_rtz(uchar x) +{ + float r = convert_float(x); + uchar y = 
convert_uchar(y); + uchar abs_x = abs(x); + uchar abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x)); +} +_CLC_DEF _CLC_OVERLOAD +float convert_float_rte(uchar x) +{ + return convert_float(x); +} +_CLC_DEF _CLC_OVERLOAD +float convert_float_rtp(uchar x) +{ + float r = convert_float(x); + uchar y = convert_uchar(y); + return select(r, nextafter(r, (float)INFINITY), convert_int(y < x)); +} +_CLC_DEF _CLC_OVERLOAD +float convert_float_rtn(uchar x) +{ + float r = convert_float(x); + uchar y = convert_uchar(y); + return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x)); +} +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2_rtz(uchar2 x) +{ + float2 r = convert_float2(x); + uchar2 y = convert_uchar2(y); + uchar2 abs_x = abs(x); + uchar2 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x)); +} +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2_rte(uchar2 x) +{ + return convert_float2(x); +} +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2_rtp(uchar2 x) +{ + float2 r = convert_float2(x); + uchar2 y = convert_uchar2(y); + return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x)); +} +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2_rtn(uchar2 x) +{ + float2 r = convert_float2(x); + uchar2 y = convert_uchar2(y); + return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x)); +} +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3_rtz(uchar3 x) +{ + float3 r = convert_float3(x); + uchar3 y = convert_uchar3(y); + uchar3 abs_x = abs(x); + uchar3 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x)); +} +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3_rte(uchar3 x) +{ + return convert_float3(x); +} +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3_rtp(uchar3 x) +{ + float3 r = convert_float3(x); + uchar3 y = convert_uchar3(y); + return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x)); +} +_CLC_DEF 
_CLC_OVERLOAD +float3 convert_float3_rtn(uchar3 x) +{ + float3 r = convert_float3(x); + uchar3 y = convert_uchar3(y); + return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x)); +} +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4_rtz(uchar4 x) +{ + float4 r = convert_float4(x); + uchar4 y = convert_uchar4(y); + uchar4 abs_x = abs(x); + uchar4 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x)); +} +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4_rte(uchar4 x) +{ + return convert_float4(x); +} +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4_rtp(uchar4 x) +{ + float4 r = convert_float4(x); + uchar4 y = convert_uchar4(y); + return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x)); +} +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4_rtn(uchar4 x) +{ + float4 r = convert_float4(x); + uchar4 y = convert_uchar4(y); + return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x)); +} +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8_rtz(uchar8 x) +{ + float8 r = convert_float8(x); + uchar8 y = convert_uchar8(y); + uchar8 abs_x = abs(x); + uchar8 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x)); +} +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8_rte(uchar8 x) +{ + return convert_float8(x); +} +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8_rtp(uchar8 x) +{ + float8 r = convert_float8(x); + uchar8 y = convert_uchar8(y); + return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x)); +} +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8_rtn(uchar8 x) +{ + float8 r = convert_float8(x); + uchar8 y = convert_uchar8(y); + return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x)); +} +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16_rtz(uchar16 x) +{ + float16 r = convert_float16(x); + uchar16 y = convert_uchar16(y); + uchar16 abs_x = abs(x); + uchar16 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (float16)-INFINITY), 
convert_int16(abs_y > abs_x)); +} +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16_rte(uchar16 x) +{ + return convert_float16(x); +} +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16_rtp(uchar16 x) +{ + float16 r = convert_float16(x); + uchar16 y = convert_uchar16(y); + return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x)); +} +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16_rtn(uchar16 x) +{ + float16 r = convert_float16(x); + uchar16 y = convert_uchar16(y); + return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x)); +} +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double_rtz(uchar x) +{ + double r = convert_double(x); + uchar y = convert_uchar(y); + uchar abs_x = abs(x); + uchar abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double_rte(uchar x) +{ + return convert_double(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double_rtp(uchar x) +{ + double r = convert_double(x); + uchar y = convert_uchar(y); + return select(r, nextafter(r, (double)INFINITY), convert_long(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double_rtn(uchar x) +{ + double r = convert_double(x); + uchar y = convert_uchar(y); + return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2_rtz(uchar2 x) +{ + double2 r = convert_double2(x); + uchar2 y = convert_uchar2(y); + uchar2 abs_x = abs(x); + uchar2 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2_rte(uchar2 x) +{ + return convert_double2(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2_rtp(uchar2 x) +{ + double2 r = convert_double2(x); + uchar2 
y = convert_uchar2(y); + return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2_rtn(uchar2 x) +{ + double2 r = convert_double2(x); + uchar2 y = convert_uchar2(y); + return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3_rtz(uchar3 x) +{ + double3 r = convert_double3(x); + uchar3 y = convert_uchar3(y); + uchar3 abs_x = abs(x); + uchar3 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3_rte(uchar3 x) +{ + return convert_double3(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3_rtp(uchar3 x) +{ + double3 r = convert_double3(x); + uchar3 y = convert_uchar3(y); + return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3_rtn(uchar3 x) +{ + double3 r = convert_double3(x); + uchar3 y = convert_uchar3(y); + return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4_rtz(uchar4 x) +{ + double4 r = convert_double4(x); + uchar4 y = convert_uchar4(y); + uchar4 abs_x = abs(x); + uchar4 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4_rte(uchar4 x) +{ + return convert_double4(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4_rtp(uchar4 x) +{ + double4 r = convert_double4(x); + uchar4 y = convert_uchar4(y); + return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double4 
convert_double4_rtn(uchar4 x) +{ + double4 r = convert_double4(x); + uchar4 y = convert_uchar4(y); + return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rtz(uchar8 x) +{ + double8 r = convert_double8(x); + uchar8 y = convert_uchar8(y); + uchar8 abs_x = abs(x); + uchar8 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rte(uchar8 x) +{ + return convert_double8(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rtp(uchar8 x) +{ + double8 r = convert_double8(x); + uchar8 y = convert_uchar8(y); + return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rtn(uchar8 x) +{ + double8 r = convert_double8(x); + uchar8 y = convert_uchar8(y); + return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rtz(uchar16 x) +{ + double16 r = convert_double16(x); + uchar16 y = convert_uchar16(y); + uchar16 abs_x = abs(x); + uchar16 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rte(uchar16 x) +{ + return convert_double16(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rtp(uchar16 x) +{ + double16 r = convert_double16(x); + uchar16 y = convert_uchar16(y); + return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rtn(uchar16 x) +{ + double16 r = convert_double16(x); + uchar16 y = convert_uchar16(y); + return select(r, nextafter(r, 
(double16)-INFINITY), convert_long16(y > x)); +} +#endif +_CLC_DEF _CLC_OVERLOAD +float convert_float_rtz(short x) +{ + float r = convert_float(x); + short y = convert_short(y); + ushort abs_x = abs(x); + ushort abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x)); +} +_CLC_DEF _CLC_OVERLOAD +float convert_float_rte(short x) +{ + return convert_float(x); +} +_CLC_DEF _CLC_OVERLOAD +float convert_float_rtp(short x) +{ + float r = convert_float(x); + short y = convert_short(y); + return select(r, nextafter(r, (float)INFINITY), convert_int(y < x)); +} +_CLC_DEF _CLC_OVERLOAD +float convert_float_rtn(short x) +{ + float r = convert_float(x); + short y = convert_short(y); + return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x)); +} +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2_rtz(short2 x) +{ + float2 r = convert_float2(x); + short2 y = convert_short2(y); + ushort2 abs_x = abs(x); + ushort2 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x)); +} +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2_rte(short2 x) +{ + return convert_float2(x); +} +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2_rtp(short2 x) +{ + float2 r = convert_float2(x); + short2 y = convert_short2(y); + return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x)); +} +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2_rtn(short2 x) +{ + float2 r = convert_float2(x); + short2 y = convert_short2(y); + return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x)); +} +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3_rtz(short3 x) +{ + float3 r = convert_float3(x); + short3 y = convert_short3(y); + ushort3 abs_x = abs(x); + ushort3 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x)); +} +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3_rte(short3 x) +{ + return convert_float3(x); +} +_CLC_DEF _CLC_OVERLOAD +float3 
convert_float3_rtp(short3 x) +{ + float3 r = convert_float3(x); + short3 y = convert_short3(y); + return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x)); +} +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3_rtn(short3 x) +{ + float3 r = convert_float3(x); + short3 y = convert_short3(y); + return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x)); +} +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4_rtz(short4 x) +{ + float4 r = convert_float4(x); + short4 y = convert_short4(y); + ushort4 abs_x = abs(x); + ushort4 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x)); +} +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4_rte(short4 x) +{ + return convert_float4(x); +} +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4_rtp(short4 x) +{ + float4 r = convert_float4(x); + short4 y = convert_short4(y); + return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x)); +} +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4_rtn(short4 x) +{ + float4 r = convert_float4(x); + short4 y = convert_short4(y); + return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x)); +} +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8_rtz(short8 x) +{ + float8 r = convert_float8(x); + short8 y = convert_short8(y); + ushort8 abs_x = abs(x); + ushort8 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x)); +} +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8_rte(short8 x) +{ + return convert_float8(x); +} +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8_rtp(short8 x) +{ + float8 r = convert_float8(x); + short8 y = convert_short8(y); + return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x)); +} +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8_rtn(short8 x) +{ + float8 r = convert_float8(x); + short8 y = convert_short8(y); + return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x)); +} +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16_rtz(short16 x) 
+{ + float16 r = convert_float16(x); + short16 y = convert_short16(y); + ushort16 abs_x = abs(x); + ushort16 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x)); +} +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16_rte(short16 x) +{ + return convert_float16(x); +} +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16_rtp(short16 x) +{ + float16 r = convert_float16(x); + short16 y = convert_short16(y); + return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x)); +} +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16_rtn(short16 x) +{ + float16 r = convert_float16(x); + short16 y = convert_short16(y); + return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x)); +} +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double_rtz(short x) +{ + double r = convert_double(x); + short y = convert_short(y); + ushort abs_x = abs(x); + ushort abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double_rte(short x) +{ + return convert_double(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double_rtp(short x) +{ + double r = convert_double(x); + short y = convert_short(y); + return select(r, nextafter(r, (double)INFINITY), convert_long(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double_rtn(short x) +{ + double r = convert_double(x); + short y = convert_short(y); + return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2_rtz(short2 x) +{ + double2 r = convert_double2(x); + short2 y = convert_short2(y); + ushort2 abs_x = abs(x); + ushort2 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double2 
convert_double2_rte(short2 x) +{ + return convert_double2(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2_rtp(short2 x) +{ + double2 r = convert_double2(x); + short2 y = convert_short2(y); + return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2_rtn(short2 x) +{ + double2 r = convert_double2(x); + short2 y = convert_short2(y); + return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3_rtz(short3 x) +{ + double3 r = convert_double3(x); + short3 y = convert_short3(y); + ushort3 abs_x = abs(x); + ushort3 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3_rte(short3 x) +{ + return convert_double3(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3_rtp(short3 x) +{ + double3 r = convert_double3(x); + short3 y = convert_short3(y); + return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3_rtn(short3 x) +{ + double3 r = convert_double3(x); + short3 y = convert_short3(y); + return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4_rtz(short4 x) +{ + double4 r = convert_double4(x); + short4 y = convert_short4(y); + ushort4 abs_x = abs(x); + ushort4 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4_rte(short4 x) +{ + return convert_double4(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4_rtp(short4 x) +{ + double4 r = 
convert_double4(x); + short4 y = convert_short4(y); + return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4_rtn(short4 x) +{ + double4 r = convert_double4(x); + short4 y = convert_short4(y); + return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rtz(short8 x) +{ + double8 r = convert_double8(x); + short8 y = convert_short8(y); + ushort8 abs_x = abs(x); + ushort8 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rte(short8 x) +{ + return convert_double8(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rtp(short8 x) +{ + double8 r = convert_double8(x); + short8 y = convert_short8(y); + return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rtn(short8 x) +{ + double8 r = convert_double8(x); + short8 y = convert_short8(y); + return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rtz(short16 x) +{ + double16 r = convert_double16(x); + short16 y = convert_short16(y); + ushort16 abs_x = abs(x); + ushort16 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rte(short16 x) +{ + return convert_double16(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rtp(short16 x) +{ + double16 r = convert_double16(x); + short16 y = convert_short16(y); + return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x)); +} +#endif +#ifdef 
cl_khr_fp64
/* short16 -> double16, rounding toward negative infinity. */
_CLC_DEF _CLC_OVERLOAD
double16 convert_double16_rtn(short16 x)
{
  double16 r = convert_double16(x);
  /* Convert the result back so it can be compared with the input. */
  short16 y = convert_short16(r);
  /* Where r ended up above x, step down to the next representable value. */
  return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
}
#endif

/*
 * Integer -> floating-point conversions with an explicit rounding mode.
 *
 * _CLC_CONVERT_INT_TO_FP_ROUNDED(DST, SRC, USRC, SEL) defines the four
 * overloads convert_DST_rtz, convert_DST_rte, convert_DST_rtp and
 * convert_DST_rtn, each taking one SRC argument:
 *   DST  - floating-point result type (e.g. float4)
 *   SRC  - integer argument type (e.g. int4)
 *   USRC - unsigned type of the same width as SRC; holds the abs() results
 *   SEL  - integer type handed to select() as the condition
 *          (intN for float results, longN for double results)
 *
 * Every directed mode starts from r = convert_DST(x), which is also the value
 * the _rte overload returns unchanged. r is converted back to SRC as y and
 * compared with x; where the comparison shows that r lies on the wrong side
 * of x for the requested direction, select() replaces r with its neighbour
 * from nextafter() in that direction.
 *
 * y has to be computed from r. Initialising it from itself
 * (SRC y = convert_SRC(y)) would read an indeterminate value and make the
 * comparison meaningless.
 *
 * NOTE(review): for int, uint and long sources r can round up past the
 * largest SRC value, in which case the non-saturating back-conversion is out
 * of range. Consider convert_SRC_sat(r) if that overload is declared before
 * this point - TODO confirm.
 */
#define _CLC_CONVERT_INT_TO_FP_ROUNDED(DST, SRC, USRC, SEL) \
_CLC_DEF _CLC_OVERLOAD \
DST convert_##DST##_rtz(SRC x) \
{ \
  DST r = convert_##DST(x); \
  SRC y = convert_##SRC(r); \
  USRC abs_x = abs(x); \
  USRC abs_y = abs(y); \
  return select(r, nextafter(r, sign(r) * (DST)-INFINITY), convert_##SEL(abs_y > abs_x)); \
} \
_CLC_DEF _CLC_OVERLOAD \
DST convert_##DST##_rte(SRC x) \
{ \
  return convert_##DST(x); \
} \
_CLC_DEF _CLC_OVERLOAD \
DST convert_##DST##_rtp(SRC x) \
{ \
  DST r = convert_##DST(x); \
  SRC y = convert_##SRC(r); \
  return select(r, nextafter(r, (DST)INFINITY), convert_##SEL(y < x)); \
} \
_CLC_DEF _CLC_OVERLOAD \
DST convert_##DST##_rtn(SRC x) \
{ \
  DST r = convert_##DST(x); \
  SRC y = convert_##SRC(r); \
  return select(r, nextafter(r, (DST)-INFINITY), convert_##SEL(y > x)); \
}

/* Instantiates the scalar overloads and the 2/3/4/8/16-wide vector overloads. */
#define _CLC_CONVERT_INT_TO_FP_ROUNDED_VEC(DST, SRC, USRC, SEL) \
  _CLC_CONVERT_INT_TO_FP_ROUNDED(DST, SRC, USRC, SEL) \
  _CLC_CONVERT_INT_TO_FP_ROUNDED(DST##2, SRC##2, USRC##2, SEL##2) \
  _CLC_CONVERT_INT_TO_FP_ROUNDED(DST##3, SRC##3, USRC##3, SEL##3) \
  _CLC_CONVERT_INT_TO_FP_ROUNDED(DST##4, SRC##4, USRC##4, SEL##4) \
  _CLC_CONVERT_INT_TO_FP_ROUNDED(DST##8, SRC##8, USRC##8, SEL##8) \
  _CLC_CONVERT_INT_TO_FP_ROUNDED(DST##16, SRC##16, USRC##16, SEL##16)

/* ushort sources */
_CLC_CONVERT_INT_TO_FP_ROUNDED_VEC(float, ushort, ushort, int)
#ifdef cl_khr_fp64
_CLC_CONVERT_INT_TO_FP_ROUNDED_VEC(double, ushort, ushort, long)
#endif

/* int sources */
_CLC_CONVERT_INT_TO_FP_ROUNDED_VEC(float, int, uint, int)
#ifdef cl_khr_fp64
_CLC_CONVERT_INT_TO_FP_ROUNDED_VEC(double, int, uint, long)
#endif

/* uint sources */
_CLC_CONVERT_INT_TO_FP_ROUNDED_VEC(float, uint, uint, int)
#ifdef cl_khr_fp64
_CLC_CONVERT_INT_TO_FP_ROUNDED_VEC(double, uint, uint, long)
#endif

/* long sources (only when 64-bit integers are available) */
#ifdef cles_khr_int64
_CLC_CONVERT_INT_TO_FP_ROUNDED_VEC(float, long, ulong, int)
#endif
#if defined(cl_khr_fp64) && defined(cles_khr_int64)
/* Widths 1-4 only: the 8- and 16-wide long -> double overloads follow below. */
_CLC_CONVERT_INT_TO_FP_ROUNDED(double, long, ulong, long)
_CLC_CONVERT_INT_TO_FP_ROUNDED(double2, long2, ulong2, long2)
_CLC_CONVERT_INT_TO_FP_ROUNDED(double3, long3, ulong3, long3)
_CLC_CONVERT_INT_TO_FP_ROUNDED(double4, long4, ulong4, long4)
#endif

#undef _CLC_CONVERT_INT_TO_FP_ROUNDED_VEC
#undef _CLC_CONVERT_INT_TO_FP_ROUNDED

#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rtz(long8 x) +{ + double8 r = convert_double8(x); + long8 y = convert_long8(y); + ulong8 abs_x = abs(x); + ulong8 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rte(long8 x) +{ + return convert_double8(x); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rtp(long8 x) +{ + double8 r = convert_double8(x); + long8 y = convert_long8(y); + return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rtn(long8 x) +{ + double8 r = convert_double8(x); + long8 y = convert_long8(y); + return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rtz(long16 x) +{ + double16 r = convert_double16(x); + long16 y = convert_long16(y); + ulong16 abs_x = abs(x); + ulong16 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rte(long16 x) +{ + return convert_double16(x); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rtp(long16 x) +{ + double16 r = convert_double16(x); + long16 y = convert_long16(y); + return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rtn(long16 x) +{ + double16 r = convert_double16(x); + long16 y = convert_long16(y); + return select(r, nextafter(r, 
(double16)-INFINITY), convert_long16(y > x)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float convert_float_rtz(ulong x) +{ + float r = convert_float(x); + ulong y = convert_ulong(y); + ulong abs_x = abs(x); + ulong abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float convert_float_rte(ulong x) +{ + return convert_float(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float convert_float_rtp(ulong x) +{ + float r = convert_float(x); + ulong y = convert_ulong(y); + return select(r, nextafter(r, (float)INFINITY), convert_int(y < x)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float convert_float_rtn(ulong x) +{ + float r = convert_float(x); + ulong y = convert_ulong(y); + return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2_rtz(ulong2 x) +{ + float2 r = convert_float2(x); + ulong2 y = convert_ulong2(y); + ulong2 abs_x = abs(x); + ulong2 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2_rte(ulong2 x) +{ + return convert_float2(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2_rtp(ulong2 x) +{ + float2 r = convert_float2(x); + ulong2 y = convert_ulong2(y); + return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2_rtn(ulong2 x) +{ + float2 r = convert_float2(x); + ulong2 y = convert_ulong2(y); + return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3_rtz(ulong3 x) +{ + float3 r = convert_float3(x); + ulong3 y = convert_ulong3(y); + ulong3 abs_x = 
abs(x); + ulong3 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3_rte(ulong3 x) +{ + return convert_float3(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3_rtp(ulong3 x) +{ + float3 r = convert_float3(x); + ulong3 y = convert_ulong3(y); + return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3_rtn(ulong3 x) +{ + float3 r = convert_float3(x); + ulong3 y = convert_ulong3(y); + return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4_rtz(ulong4 x) +{ + float4 r = convert_float4(x); + ulong4 y = convert_ulong4(y); + ulong4 abs_x = abs(x); + ulong4 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4_rte(ulong4 x) +{ + return convert_float4(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4_rtp(ulong4 x) +{ + float4 r = convert_float4(x); + ulong4 y = convert_ulong4(y); + return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4_rtn(ulong4 x) +{ + float4 r = convert_float4(x); + ulong4 y = convert_ulong4(y); + return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8_rtz(ulong8 x) +{ + float8 r = convert_float8(x); + ulong8 y = convert_ulong8(y); + ulong8 abs_x = abs(x); + ulong8 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float8 
convert_float8_rte(ulong8 x) +{ + return convert_float8(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8_rtp(ulong8 x) +{ + float8 r = convert_float8(x); + ulong8 y = convert_ulong8(y); + return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8_rtn(ulong8 x) +{ + float8 r = convert_float8(x); + ulong8 y = convert_ulong8(y); + return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16_rtz(ulong16 x) +{ + float16 r = convert_float16(x); + ulong16 y = convert_ulong16(y); + ulong16 abs_x = abs(x); + ulong16 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16_rte(ulong16 x) +{ + return convert_float16(x); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16_rtp(ulong16 x) +{ + float16 r = convert_float16(x); + ulong16 y = convert_ulong16(y); + return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x)); +} +#endif +#ifdef cles_khr_int64 +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16_rtn(ulong16 x) +{ + float16 r = convert_float16(x); + ulong16 y = convert_ulong16(y); + return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double convert_double_rtz(ulong x) +{ + double r = convert_double(x); + ulong y = convert_ulong(y); + ulong abs_x = abs(x); + ulong abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double convert_double_rte(ulong x) +{ + return convert_double(x); +} +#endif +#if defined(cl_khr_fp64) && 
defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double convert_double_rtp(ulong x) +{ + double r = convert_double(x); + ulong y = convert_ulong(y); + return select(r, nextafter(r, (double)INFINITY), convert_long(y < x)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double convert_double_rtn(ulong x) +{ + double r = convert_double(x); + ulong y = convert_ulong(y); + return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2_rtz(ulong2 x) +{ + double2 r = convert_double2(x); + ulong2 y = convert_ulong2(y); + ulong2 abs_x = abs(x); + ulong2 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2_rte(ulong2 x) +{ + return convert_double2(x); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2_rtp(ulong2 x) +{ + double2 r = convert_double2(x); + ulong2 y = convert_ulong2(y); + return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2_rtn(ulong2 x) +{ + double2 r = convert_double2(x); + ulong2 y = convert_ulong2(y); + return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3_rtz(ulong3 x) +{ + double3 r = convert_double3(x); + ulong3 y = convert_ulong3(y); + ulong3 abs_x = abs(x); + ulong3 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3_rte(ulong3 x) +{ + 
return convert_double3(x); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3_rtp(ulong3 x) +{ + double3 r = convert_double3(x); + ulong3 y = convert_ulong3(y); + return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3_rtn(ulong3 x) +{ + double3 r = convert_double3(x); + ulong3 y = convert_ulong3(y); + return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4_rtz(ulong4 x) +{ + double4 r = convert_double4(x); + ulong4 y = convert_ulong4(y); + ulong4 abs_x = abs(x); + ulong4 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4_rte(ulong4 x) +{ + return convert_double4(x); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4_rtp(ulong4 x) +{ + double4 r = convert_double4(x); + ulong4 y = convert_ulong4(y); + return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4_rtn(ulong4 x) +{ + double4 r = convert_double4(x); + ulong4 y = convert_ulong4(y); + return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rtz(ulong8 x) +{ + double8 r = convert_double8(x); + ulong8 y = convert_ulong8(y); + ulong8 abs_x = abs(x); + ulong8 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x)); +} +#endif +#if defined(cl_khr_fp64) && 
defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rte(ulong8 x) +{ + return convert_double8(x); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rtp(ulong8 x) +{ + double8 r = convert_double8(x); + ulong8 y = convert_ulong8(y); + return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rtn(ulong8 x) +{ + double8 r = convert_double8(x); + ulong8 y = convert_ulong8(y); + return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rtz(ulong16 x) +{ + double16 r = convert_double16(x); + ulong16 y = convert_ulong16(y); + ulong16 abs_x = abs(x); + ulong16 abs_y = abs(y); + return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rte(ulong16 x) +{ + return convert_double16(x); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rtp(ulong16 x) +{ + double16 r = convert_double16(x); + ulong16 y = convert_ulong16(y); + return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x)); +} +#endif +#if defined(cl_khr_fp64) && defined(cles_khr_int64) +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rtn(ulong16 x) +{ + double16 r = convert_double16(x); + ulong16 y = convert_ulong16(y); + return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x)); +} +#endif +_CLC_DEF _CLC_OVERLOAD +float convert_float_rtz(float x) +{ + float r = convert_float(x); + float y = convert_float(y); + float abs_x = fabs(x); + float abs_y = fabs(y); + return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > 
abs_x)); +} +_CLC_DEF _CLC_OVERLOAD +float convert_float_rte(float x) +{ + return convert_float(x); +} +_CLC_DEF _CLC_OVERLOAD +float convert_float_rtp(float x) +{ + float r = convert_float(x); + float y = convert_float(y); + return select(r, nextafter(r, (float)INFINITY), convert_int(y < x)); +} +_CLC_DEF _CLC_OVERLOAD +float convert_float_rtn(float x) +{ + float r = convert_float(x); + float y = convert_float(y); + return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x)); +} +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2_rtz(float2 x) +{ + float2 r = convert_float2(x); + float2 y = convert_float2(y); + float2 abs_x = fabs(x); + float2 abs_y = fabs(y); + return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x)); +} +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2_rte(float2 x) +{ + return convert_float2(x); +} +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2_rtp(float2 x) +{ + float2 r = convert_float2(x); + float2 y = convert_float2(y); + return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x)); +} +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2_rtn(float2 x) +{ + float2 r = convert_float2(x); + float2 y = convert_float2(y); + return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x)); +} +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3_rtz(float3 x) +{ + float3 r = convert_float3(x); + float3 y = convert_float3(y); + float3 abs_x = fabs(x); + float3 abs_y = fabs(y); + return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x)); +} +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3_rte(float3 x) +{ + return convert_float3(x); +} +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3_rtp(float3 x) +{ + float3 r = convert_float3(x); + float3 y = convert_float3(y); + return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x)); +} +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3_rtn(float3 x) +{ + float3 r = convert_float3(x); + float3 y = convert_float3(y); + return select(r, 
nextafter(r, (float3)-INFINITY), convert_int3(y > x)); +} +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4_rtz(float4 x) +{ + float4 r = convert_float4(x); + float4 y = convert_float4(y); + float4 abs_x = fabs(x); + float4 abs_y = fabs(y); + return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x)); +} +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4_rte(float4 x) +{ + return convert_float4(x); +} +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4_rtp(float4 x) +{ + float4 r = convert_float4(x); + float4 y = convert_float4(y); + return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x)); +} +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4_rtn(float4 x) +{ + float4 r = convert_float4(x); + float4 y = convert_float4(y); + return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x)); +} +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8_rtz(float8 x) +{ + float8 r = convert_float8(x); + float8 y = convert_float8(y); + float8 abs_x = fabs(x); + float8 abs_y = fabs(y); + return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x)); +} +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8_rte(float8 x) +{ + return convert_float8(x); +} +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8_rtp(float8 x) +{ + float8 r = convert_float8(x); + float8 y = convert_float8(y); + return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x)); +} +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8_rtn(float8 x) +{ + float8 r = convert_float8(x); + float8 y = convert_float8(y); + return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x)); +} +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16_rtz(float16 x) +{ + float16 r = convert_float16(x); + float16 y = convert_float16(y); + float16 abs_x = fabs(x); + float16 abs_y = fabs(y); + return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x)); +} +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16_rte(float16 x) +{ + return convert_float16(x); 
+} +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16_rtp(float16 x) +{ + float16 r = convert_float16(x); + float16 y = convert_float16(y); + return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x)); +} +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16_rtn(float16 x) +{ + float16 r = convert_float16(x); + float16 y = convert_float16(y); + return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x)); +} +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double_rtz(float x) +{ + double r = convert_double(x); + float y = convert_float(y); + float abs_x = fabs(x); + float abs_y = fabs(y); + return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double_rte(float x) +{ + return convert_double(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double_rtp(float x) +{ + double r = convert_double(x); + float y = convert_float(y); + return select(r, nextafter(r, (double)INFINITY), convert_long(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double_rtn(float x) +{ + double r = convert_double(x); + float y = convert_float(y); + return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2_rtz(float2 x) +{ + double2 r = convert_double2(x); + float2 y = convert_float2(y); + float2 abs_x = fabs(x); + float2 abs_y = fabs(y); + return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2_rte(float2 x) +{ + return convert_double2(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2_rtp(float2 x) +{ + double2 r = convert_double2(x); + float2 y = convert_float2(y); + return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x)); +} +#endif +#ifdef 
cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2_rtn(float2 x) +{ + double2 r = convert_double2(x); + float2 y = convert_float2(y); + return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3_rtz(float3 x) +{ + double3 r = convert_double3(x); + float3 y = convert_float3(y); + float3 abs_x = fabs(x); + float3 abs_y = fabs(y); + return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3_rte(float3 x) +{ + return convert_double3(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3_rtp(float3 x) +{ + double3 r = convert_double3(x); + float3 y = convert_float3(y); + return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3_rtn(float3 x) +{ + double3 r = convert_double3(x); + float3 y = convert_float3(y); + return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4_rtz(float4 x) +{ + double4 r = convert_double4(x); + float4 y = convert_float4(y); + float4 abs_x = fabs(x); + float4 abs_y = fabs(y); + return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4_rte(float4 x) +{ + return convert_double4(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4_rtp(float4 x) +{ + double4 r = convert_double4(x); + float4 y = convert_float4(y); + return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4_rtn(float4 x) +{ + double4 r = convert_double4(x); + float4 y = convert_float4(y); + return select(r, nextafter(r, 
(double4)-INFINITY), convert_long4(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rtz(float8 x) +{ + double8 r = convert_double8(x); + float8 y = convert_float8(y); + float8 abs_x = fabs(x); + float8 abs_y = fabs(y); + return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rte(float8 x) +{ + return convert_double8(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rtp(float8 x) +{ + double8 r = convert_double8(x); + float8 y = convert_float8(y); + return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rtn(float8 x) +{ + double8 r = convert_double8(x); + float8 y = convert_float8(y); + return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rtz(float16 x) +{ + double16 r = convert_double16(x); + float16 y = convert_float16(y); + float16 abs_x = fabs(x); + float16 abs_y = fabs(y); + return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rte(float16 x) +{ + return convert_double16(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rtp(float16 x) +{ + double16 r = convert_double16(x); + float16 y = convert_float16(y); + return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rtn(float16 x) +{ + double16 r = convert_double16(x); + float16 y = convert_float16(y); + return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +float convert_float_rtz(double x) +{ + 
float r = convert_float(x); + double y = convert_double(y); + double abs_x = fabs(x); + double abs_y = fabs(y); + return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +float convert_float_rte(double x) +{ + return convert_float(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +float convert_float_rtp(double x) +{ + float r = convert_float(x); + double y = convert_double(y); + return select(r, nextafter(r, (float)INFINITY), convert_int(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +float convert_float_rtn(double x) +{ + float r = convert_float(x); + double y = convert_double(y); + return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2_rtz(double2 x) +{ + float2 r = convert_float2(x); + double2 y = convert_double2(y); + double2 abs_x = fabs(x); + double2 abs_y = fabs(y); + return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2_rte(double2 x) +{ + return convert_float2(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2_rtp(double2 x) +{ + float2 r = convert_float2(x); + double2 y = convert_double2(y); + return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +float2 convert_float2_rtn(double2 x) +{ + float2 r = convert_float2(x); + double2 y = convert_double2(y); + return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3_rtz(double3 x) +{ + float3 r = convert_float3(x); + double3 y = convert_double3(y); + double3 abs_x = fabs(x); + double3 abs_y = fabs(y); + return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x)); +} +#endif 
+#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3_rte(double3 x) +{ + return convert_float3(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3_rtp(double3 x) +{ + float3 r = convert_float3(x); + double3 y = convert_double3(y); + return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +float3 convert_float3_rtn(double3 x) +{ + float3 r = convert_float3(x); + double3 y = convert_double3(y); + return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4_rtz(double4 x) +{ + float4 r = convert_float4(x); + double4 y = convert_double4(y); + double4 abs_x = fabs(x); + double4 abs_y = fabs(y); + return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4_rte(double4 x) +{ + return convert_float4(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4_rtp(double4 x) +{ + float4 r = convert_float4(x); + double4 y = convert_double4(y); + return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +float4 convert_float4_rtn(double4 x) +{ + float4 r = convert_float4(x); + double4 y = convert_double4(y); + return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8_rtz(double8 x) +{ + float8 r = convert_float8(x); + double8 y = convert_double8(y); + double8 abs_x = fabs(x); + double8 abs_y = fabs(y); + return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8_rte(double8 x) +{ + return convert_float8(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +float8 
convert_float8_rtp(double8 x) +{ + float8 r = convert_float8(x); + double8 y = convert_double8(y); + return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +float8 convert_float8_rtn(double8 x) +{ + float8 r = convert_float8(x); + double8 y = convert_double8(y); + return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16_rtz(double16 x) +{ + float16 r = convert_float16(x); + double16 y = convert_double16(y); + double16 abs_x = fabs(x); + double16 abs_y = fabs(y); + return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16_rte(double16 x) +{ + return convert_float16(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16_rtp(double16 x) +{ + float16 r = convert_float16(x); + double16 y = convert_double16(y); + return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +float16 convert_float16_rtn(double16 x) +{ + float16 r = convert_float16(x); + double16 y = convert_double16(y); + return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double_rtz(double x) +{ + double r = convert_double(x); + double y = convert_double(y); + double abs_x = fabs(x); + double abs_y = fabs(y); + return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double_rte(double x) +{ + return convert_double(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double_rtp(double x) +{ + double r = convert_double(x); + double y = convert_double(y); + return select(r, nextafter(r, (double)INFINITY), convert_long(y < 
x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double convert_double_rtn(double x) +{ + double r = convert_double(x); + double y = convert_double(y); + return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2_rtz(double2 x) +{ + double2 r = convert_double2(x); + double2 y = convert_double2(y); + double2 abs_x = fabs(x); + double2 abs_y = fabs(y); + return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2_rte(double2 x) +{ + return convert_double2(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2_rtp(double2 x) +{ + double2 r = convert_double2(x); + double2 y = convert_double2(y); + return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double2 convert_double2_rtn(double2 x) +{ + double2 r = convert_double2(x); + double2 y = convert_double2(y); + return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3_rtz(double3 x) +{ + double3 r = convert_double3(x); + double3 y = convert_double3(y); + double3 abs_x = fabs(x); + double3 abs_y = fabs(y); + return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3_rte(double3 x) +{ + return convert_double3(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3_rtp(double3 x) +{ + double3 r = convert_double3(x); + double3 y = convert_double3(y); + return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double3 convert_double3_rtn(double3 x) +{ + double3 r = convert_double3(x); + double3 y = 
convert_double3(r); + return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4_rtz(double4 x) +{ + double4 r = convert_double4(x); + /* y is r converted back to the source type; comparing y with x shows which way the default conversion rounded */ + double4 y = convert_double4(r); + double4 abs_x = fabs(x); + double4 abs_y = fabs(y); + return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4_rte(double4 x) +{ + return convert_double4(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4_rtp(double4 x) +{ + double4 r = convert_double4(x); + double4 y = convert_double4(r); + return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double4 convert_double4_rtn(double4 x) +{ + double4 r = convert_double4(x); + double4 y = convert_double4(r); + return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rtz(double8 x) +{ + double8 r = convert_double8(x); + double8 y = convert_double8(r); + double8 abs_x = fabs(x); + double8 abs_y = fabs(y); + return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rte(double8 x) +{ + return convert_double8(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rtp(double8 x) +{ + double8 r = convert_double8(x); + double8 y = convert_double8(r); + return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double8 convert_double8_rtn(double8 x) +{ + double8 r = convert_double8(x); + double8 y = convert_double8(r); + return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD 
+double16 convert_double16_rtz(double16 x) +{ + double16 r = convert_double16(x); + /* y is r converted back to the source type; comparing y with x shows which way the default conversion rounded */ + double16 y = convert_double16(r); + double16 abs_x = fabs(x); + double16 abs_y = fabs(y); + return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rte(double16 x) +{ + return convert_double16(x); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rtp(double16 x) +{ + double16 r = convert_double16(x); + double16 y = convert_double16(r); + return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x)); +} +#endif +#ifdef cl_khr_fp64 +_CLC_DEF _CLC_OVERLOAD +double16 convert_double16_rtn(double16 x) +{ + double16 r = convert_double16(x); + double16 y = convert_double16(r); + return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x)); +} +#endif + + +#endif // ASW diff --git a/src/builtins/cross.cl b/src/builtins/cross.cl new file mode 100644 index 0000000..a3e019f --- /dev/null +++ b/src/builtins/cross.cl @@ -0,0 +1,59 @@ +/****************************************************************************** + * Copyright (c) 2011-2013, Peter Collingbourne <peter@pcc.me.uk> + * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "clc.h" + +_CLC_OVERLOAD _CLC_DEF float3 cross(float3 p0, float3 p1) +{ + return (float3)(p0.y*p1.z - p0.z*p1.y, + p0.z*p1.x - p0.x*p1.z, + p0.x*p1.y - p0.y*p1.x); +} + +_CLC_OVERLOAD _CLC_DEF float4 cross(float4 p0, float4 p1) +{ + return (float4)(p0.y*p1.z - p0.z*p1.y, + p0.z*p1.x - p0.x*p1.z, + p0.x*p1.y - p0.y*p1.x, + 0.f); +} + +_CLC_OVERLOAD _CLC_DEF double3 cross(double3 p0, double3 p1) +{ + return (double3)(p0.y*p1.z - p0.z*p1.y, + p0.z*p1.x - p0.x*p1.z, + p0.x*p1.y - p0.y*p1.x); +} + +_CLC_OVERLOAD _CLC_DEF double4 cross(double4 p0, double4 p1) +{ + return (double4)(p0.y*p1.z - p0.z*p1.y, + p0.z*p1.x - p0.x*p1.z, + p0.x*p1.y - p0.y*p1.x, + 0.); +} diff --git a/src/builtins/degrees.cl b/src/builtins/degrees.cl new file mode 100644 index 0000000..329e0f1 --- /dev/null +++ b/src/builtins/degrees.cl @@ -0,0 +1,41 @@ +/****************************************************************************** + * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "clc.h" + +#define EXPAND_SIZES(type) \ + IMPLEMENTATION(_VEC_TYPE(type,3)) \ + IMPLEMENTATION(_VEC_TYPE(type,4)) \ + IMPLEMENTATION(_VEC_TYPE(type,8)) \ + IMPLEMENTATION(_VEC_TYPE(type,16)) \ + +#define IMPLEMENTATION(gentype) \ +_CLC_OVERLOAD _CLC_DEF gentype degrees(gentype radians) { return radians * (gentype)180.0 * (gentype)M_1_PI; } \ +_CLC_OVERLOAD _CLC_DEF gentype radians(gentype degrees) { return degrees * (gentype)M_PI / (gentype)180.0; } + +EXPAND_SIZES(float) +EXPAND_SIZES(double) diff --git a/src/builtins/dot.cl b/src/builtins/dot.cl new file mode 100644 index 0000000..0b16d66 --- /dev/null +++ b/src/builtins/dot.cl @@ -0,0 +1,41 @@ +/****************************************************************************** + * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "clc.h" + +_CLC_OVERLOAD _CLC_DEF float dot(float3 p0, float3 p1) +{ return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z; } + +_CLC_OVERLOAD _CLC_DEF float dot(float4 p0, float4 p1) +{ return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z + p0.w*p1.w; } + +_CLC_OVERLOAD _CLC_DEF double dot(double3 p0, double3 p1) +{ return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z; } + +_CLC_OVERLOAD _CLC_DEF double dot(double4 p0, double4 p1) +{ return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z + p0.w*p1.w; } + diff --git a/src/builtins/fract.cl b/src/builtins/fract.cl new file mode 100644 index 0000000..11f08e8 --- /dev/null +++ b/src/builtins/fract.cl @@ -0,0 +1,93 @@ +/****************************************************************************** + * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "clc.h" + +#define SCALAR(type, the_max) \ +{ \ + type the_floor = floor(x); \ + *ptr = the_floor; \ + if (isnan(x)) return x; \ + return fmin(x - the_floor, (type) (the_max)); \ +} \ + +#define BODY(type, the_max) \ +{ \ + type the_floor = floor(x); \ + *ptr = the_floor; \ + type result = fmin(x - the_floor, (type) (the_max)); \ + return select(result, x, isnan(x)); \ +} \ + +_CLC_OVERLOAD _CLC_DEF float fract(float x, global float * ptr) SCALAR(float, 0x1.fffffep-1f) +_CLC_OVERLOAD _CLC_DEF float fract(float x, local float * ptr) SCALAR(float, 0x1.fffffep-1f) +_CLC_OVERLOAD _CLC_DEF float fract(float x, private float * ptr) SCALAR(float, 0x1.fffffep-1f) + +_CLC_OVERLOAD _CLC_DEF float2 fract(float2 x, global float2 * ptr) BODY(float2, 0x1.fffffep-1f) +_CLC_OVERLOAD _CLC_DEF float2 fract(float2 x, local float2 * ptr) BODY(float2, 0x1.fffffep-1f) +_CLC_OVERLOAD _CLC_DEF float2 fract(float2 x, private float2 * ptr) BODY(float2, 0x1.fffffep-1f) + +_CLC_OVERLOAD _CLC_DEF float3 fract(float3 x, global float3 * ptr) BODY(float3, 0x1.fffffep-1f) +_CLC_OVERLOAD _CLC_DEF float3 fract(float3 x, local float3 * ptr) BODY(float3, 0x1.fffffep-1f) +_CLC_OVERLOAD _CLC_DEF float3 fract(float3 x, private float3 * ptr) BODY(float3, 0x1.fffffep-1f) + +_CLC_OVERLOAD _CLC_DEF float4 fract(float4 x, global float4 * ptr) BODY(float4, 0x1.fffffep-1f) +_CLC_OVERLOAD 
_CLC_DEF float4 fract(float4 x, local float4 * ptr) BODY(float4, 0x1.fffffep-1f) +_CLC_OVERLOAD _CLC_DEF float4 fract(float4 x, private float4 * ptr) BODY(float4, 0x1.fffffep-1f) + +_CLC_OVERLOAD _CLC_DEF float8 fract(float8 x, global float8 * ptr) BODY(float8, 0x1.fffffep-1f) +_CLC_OVERLOAD _CLC_DEF float8 fract(float8 x, local float8 * ptr) BODY(float8, 0x1.fffffep-1f) +_CLC_OVERLOAD _CLC_DEF float8 fract(float8 x, private float8 * ptr) BODY(float8, 0x1.fffffep-1f) + +_CLC_OVERLOAD _CLC_DEF float16 fract(float16 x, global float16 * ptr) BODY(float16, 0x1.fffffep-1f) +_CLC_OVERLOAD _CLC_DEF float16 fract(float16 x, local float16 * ptr) BODY(float16, 0x1.fffffep-1f) +_CLC_OVERLOAD _CLC_DEF float16 fract(float16 x, private float16 * ptr) BODY(float16, 0x1.fffffep-1f) + +_CLC_OVERLOAD _CLC_DEF double fract(double x, global double * ptr) SCALAR(double, 0x1.fffffffffffffp-1) +_CLC_OVERLOAD _CLC_DEF double fract(double x, local double * ptr) SCALAR(double, 0x1.fffffffffffffp-1) +_CLC_OVERLOAD _CLC_DEF double fract(double x, private double * ptr) SCALAR(double, 0x1.fffffffffffffp-1) + +_CLC_OVERLOAD _CLC_DEF double2 fract(double2 x, global double2 * ptr) BODY(double2, 0x1.fffffffffffffp-1) +_CLC_OVERLOAD _CLC_DEF double2 fract(double2 x, local double2 * ptr) BODY(double2, 0x1.fffffffffffffp-1) +_CLC_OVERLOAD _CLC_DEF double2 fract(double2 x, private double2 * ptr) BODY(double2, 0x1.fffffffffffffp-1) + +_CLC_OVERLOAD _CLC_DEF double3 fract(double3 x, global double3 * ptr) BODY(double3, 0x1.fffffffffffffp-1) +_CLC_OVERLOAD _CLC_DEF double3 fract(double3 x, local double3 * ptr) BODY(double3, 0x1.fffffffffffffp-1) +_CLC_OVERLOAD _CLC_DEF double3 fract(double3 x, private double3 * ptr) BODY(double3, 0x1.fffffffffffffp-1) + +_CLC_OVERLOAD _CLC_DEF double4 fract(double4 x, global double4 * ptr) BODY(double4, 0x1.fffffffffffffp-1) +_CLC_OVERLOAD _CLC_DEF double4 fract(double4 x, local double4 * ptr) BODY(double4, 0x1.fffffffffffffp-1) +_CLC_OVERLOAD _CLC_DEF double4 
fract(double4 x, private double4 * ptr) BODY(double4, 0x1.fffffffffffffp-1) + +_CLC_OVERLOAD _CLC_DEF double8 fract(double8 x, global double8 * ptr) BODY(double8, 0x1.fffffffffffffp-1) +_CLC_OVERLOAD _CLC_DEF double8 fract(double8 x, local double8 * ptr) BODY(double8, 0x1.fffffffffffffp-1) +_CLC_OVERLOAD _CLC_DEF double8 fract(double8 x, private double8 * ptr) BODY(double8, 0x1.fffffffffffffp-1) + +_CLC_OVERLOAD _CLC_DEF double16 fract(double16 x, global double16 * ptr) BODY(double16, 0x1.fffffffffffffp-1) +_CLC_OVERLOAD _CLC_DEF double16 fract(double16 x, local double16 * ptr) BODY(double16, 0x1.fffffffffffffp-1) +_CLC_OVERLOAD _CLC_DEF double16 fract(double16 x, private double16 * ptr) BODY(double16, 0x1.fffffffffffffp-1) + diff --git a/src/builtins/frexp.cl b/src/builtins/frexp.cl new file mode 100644 index 0000000..e02cf90 --- /dev/null +++ b/src/builtins/frexp.cl @@ -0,0 +1,76 @@ +/****************************************************************************** + * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "clc.h" + +_CLC_OVERLOAD _CLC_DEF float frexp(float x, global int * ptr) SCALAR_BODY(float, __builtin_frexpf, int) +_CLC_OVERLOAD _CLC_DEF float frexp(float x, local int * ptr) SCALAR_BODY(float, __builtin_frexpf, int) +_CLC_OVERLOAD _CLC_DEF float frexp(float x, private int * ptr) SCALAR_BODY(float, __builtin_frexpf, int) + +_CLC_OVERLOAD _CLC_DEF float2 frexp(float2 x, global int2 * ptr) VECTOR_BODY(float, 2, __builtin_frexpf, int) +_CLC_OVERLOAD _CLC_DEF float2 frexp(float2 x, local int2 * ptr) VECTOR_BODY(float, 2, __builtin_frexpf, int) +_CLC_OVERLOAD _CLC_DEF float2 frexp(float2 x, private int2 * ptr) VECTOR_BODY(float, 2, __builtin_frexpf, int) + +_CLC_OVERLOAD _CLC_DEF float3 frexp(float3 x, global int3 * ptr) VECTOR_BODY(float, 3, __builtin_frexpf, int) +_CLC_OVERLOAD _CLC_DEF float3 frexp(float3 x, local int3 * ptr) VECTOR_BODY(float, 3, __builtin_frexpf, int) +_CLC_OVERLOAD _CLC_DEF float3 frexp(float3 x, private int3 * ptr) VECTOR_BODY(float, 3, __builtin_frexpf, int) + +_CLC_OVERLOAD _CLC_DEF float4 frexp(float4 x, global int4 * ptr) VECTOR_BODY(float, 4, __builtin_frexpf, 
int) +_CLC_OVERLOAD _CLC_DEF float4 frexp(float4 x, local int4 * ptr) VECTOR_BODY(float, 4, __builtin_frexpf, int) +_CLC_OVERLOAD _CLC_DEF float4 frexp(float4 x, private int4 * ptr) VECTOR_BODY(float, 4, __builtin_frexpf, int) + +_CLC_OVERLOAD _CLC_DEF float8 frexp(float8 x, global int8 * ptr) VECTOR_BODY(float, 8, __builtin_frexpf, int) +_CLC_OVERLOAD _CLC_DEF float8 frexp(float8 x, local int8 * ptr) VECTOR_BODY(float, 8, __builtin_frexpf, int) +_CLC_OVERLOAD _CLC_DEF float8 frexp(float8 x, private int8 * ptr) VECTOR_BODY(float, 8, __builtin_frexpf, int) + +_CLC_OVERLOAD _CLC_DEF float16 frexp(float16 x, global int16 * ptr) VECTOR_BODY(float, 16, __builtin_frexpf, int) +_CLC_OVERLOAD _CLC_DEF float16 frexp(float16 x, local int16 * ptr) VECTOR_BODY(float, 16, __builtin_frexpf, int) +_CLC_OVERLOAD _CLC_DEF float16 frexp(float16 x, private int16 * ptr) VECTOR_BODY(float, 16, __builtin_frexpf, int) + +_CLC_OVERLOAD _CLC_DEF double frexp(double x, global int * ptr) SCALAR_BODY(double, __builtin_frexp, int) +_CLC_OVERLOAD _CLC_DEF double frexp(double x, local int * ptr) SCALAR_BODY(double, __builtin_frexp, int) +_CLC_OVERLOAD _CLC_DEF double frexp(double x, private int * ptr) SCALAR_BODY(double, __builtin_frexp, int) + +_CLC_OVERLOAD _CLC_DEF double2 frexp(double2 x, global int2 * ptr) VECTOR_BODY(double, 2, __builtin_frexp, int) +_CLC_OVERLOAD _CLC_DEF double2 frexp(double2 x, local int2 * ptr) VECTOR_BODY(double, 2, __builtin_frexp, int) +_CLC_OVERLOAD _CLC_DEF double2 frexp(double2 x, private int2 * ptr) VECTOR_BODY(double, 2, __builtin_frexp, int) + +_CLC_OVERLOAD _CLC_DEF double3 frexp(double3 x, global int3 * ptr) VECTOR_BODY(double, 3, __builtin_frexp, int) +_CLC_OVERLOAD _CLC_DEF double3 frexp(double3 x, local int3 * ptr) VECTOR_BODY(double, 3, __builtin_frexp, int) +_CLC_OVERLOAD _CLC_DEF double3 frexp(double3 x, private int3 * ptr) VECTOR_BODY(double, 3, __builtin_frexp, int) + +_CLC_OVERLOAD _CLC_DEF double4 frexp(double4 x, global int4 * ptr) 
VECTOR_BODY(double, 4, __builtin_frexp, int) +_CLC_OVERLOAD _CLC_DEF double4 frexp(double4 x, local int4 * ptr) VECTOR_BODY(double, 4, __builtin_frexp, int) +_CLC_OVERLOAD _CLC_DEF double4 frexp(double4 x, private int4 * ptr) VECTOR_BODY(double, 4, __builtin_frexp, int) + +_CLC_OVERLOAD _CLC_DEF double8 frexp(double8 x, global int8 * ptr) VECTOR_BODY(double, 8, __builtin_frexp, int) +_CLC_OVERLOAD _CLC_DEF double8 frexp(double8 x, local int8 * ptr) VECTOR_BODY(double, 8, __builtin_frexp, int) +_CLC_OVERLOAD _CLC_DEF double8 frexp(double8 x, private int8 * ptr) VECTOR_BODY(double, 8, __builtin_frexp, int) + +_CLC_OVERLOAD _CLC_DEF double16 frexp(double16 x, global int16 * ptr) VECTOR_BODY(double, 16, __builtin_frexp, int) +_CLC_OVERLOAD _CLC_DEF double16 frexp(double16 x, local int16 * ptr) VECTOR_BODY(double, 16, __builtin_frexp, int) +_CLC_OVERLOAD _CLC_DEF double16 frexp(double16 x, private int16 * ptr) VECTOR_BODY(double, 16, __builtin_frexp, int) diff --git a/src/builtins/hadd.cl b/src/builtins/hadd.cl new file mode 100644 index 0000000..c96324f --- /dev/null +++ b/src/builtins/hadd.cl @@ -0,0 +1,44 @@ +/****************************************************************************** + * Copyright (c) 2011-2014, Peter Collingbourne <peter@pcc.me.uk> + * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "clc.h" + +#define EXPAND_SIZES(type) \ + IMPLEMENTATION(_VEC_TYPE(type,2)) \ + IMPLEMENTATION(_VEC_TYPE(type,3)) \ + IMPLEMENTATION(_VEC_TYPE(type,4)) \ + IMPLEMENTATION(_VEC_TYPE(type,8)) \ + IMPLEMENTATION(_VEC_TYPE(type,16)) \ + +#define IMPLEMENTATION(gentype) \ + _CLC_OVERLOAD _CLC_DEF gentype hadd(gentype x, gentype y) \ + { return (x >> (gentype)1) + (y >> (gentype)1) + (x & y & (gentype)1); } \ + _CLC_OVERLOAD _CLC_DEF gentype rhadd(gentype x, gentype y) \ + { return (x >> (gentype)1) + (y >> (gentype)1) + ((x&(gentype)1)|(y&(gentype)1)); } \ + +_EXPAND_INTEGER_TYPES() diff --git a/src/builtins/length.cl b/src/builtins/length.cl new file mode 100644 index 0000000..2cfefa1 --- /dev/null +++ b/src/builtins/length.cl @@ -0,0 +1,109 @@ +/****************************************************************************** + * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "clc.h" + +/* length of a 2-vector: divides by the larger magnitude so r*r cannot overflow */ +_CLC_OVERLOAD _CLC_DEF float length(float2 p) +{ + float r; + p = fabs(p); + /* an infinite component gives an infinite length; without this guard (inf, inf) computes inf/inf = NaN below */ + if (isinf(p.x) || isinf(p.y)) return INFINITY; + if (p.x > p.y) + { + r = p.y/p.x; + return p.x * sqrt(1+r*r); + } + else if (p.y != 0) + { + r = p.x/p.y; + return p.y * sqrt(1+r*r); + } + return 0.0; +} + +/* length of a 2-vector: divides by the larger magnitude so r*r cannot overflow */ +_CLC_OVERLOAD _CLC_DEF double length(double2 p) +{ + double r; + p = fabs(p); + /* an infinite component gives an infinite length; without this guard (inf, inf) computes inf/inf = NaN below */ + if (isinf(p.x) || isinf(p.y)) return INFINITY; + if (p.x > p.y) + { + r = p.y/p.x; + return p.x * sqrt(1+r*r); + } + else if (p.y != 0) + { + r = p.x/p.y; + return p.y * sqrt(1+r*r); + } + return 0.0; +} + +_CLC_OVERLOAD _CLC_DEF float length(float3 p) +{ + p = fabs(p); + float max_term = max(p.x, max(p.y, p.z)); + if (max_term == 0 || isinf(max_term) ) return max_term; + if (max_term < 1) return fast_length(p); + p /= max_term; + return max_term * sqrt(dot(p,p)); +} + +_CLC_OVERLOAD _CLC_DEF double length(double3 p) +{ + p = fabs(p); + double max_term = max(p.x, max(p.y, p.z)); + if (max_term == 0 || isinf(max_term) ) return max_term; + if (max_term < 1) return fast_length(p); + p /= max_term; + return max_term * sqrt(dot(p,p)); +} + +_CLC_OVERLOAD _CLC_DEF float length(float4 p) +{ + p = fabs(p); + float max_term = 
max(max(p.x, p.y), max(p.z, p.w)); + if (max_term == 0 || isinf(max_term) ) return max_term; + if (max_term < 1) return fast_length(p); + p /= max_term; + return max_term * sqrt(dot(p,p)); +} + +_CLC_OVERLOAD _CLC_DEF double length(double4 p) +{ + p = fabs(p); + double max_term = max(max(p.x, p.y), max(p.z, p.w)); + if (max_term == 0 || isinf(max_term) ) return max_term; + if (max_term < 1) return fast_length(p); + p /= max_term; + return max_term * sqrt(dot(p,p)); +} + +_CLC_OVERLOAD _CLC_DEF float fast_length(float2 p) { return sqrt(dot(p,p));} +_CLC_OVERLOAD _CLC_DEF float fast_length(float3 p) { return sqrt(dot(p,p));} +_CLC_OVERLOAD _CLC_DEF float fast_length(float4 p) { return sqrt(dot(p,p));} +_CLC_OVERLOAD _CLC_DEF double fast_length(double2 p) { return sqrt(dot(p,p));} +_CLC_OVERLOAD _CLC_DEF double fast_length(double3 p) { return sqrt(dot(p,p));} +_CLC_OVERLOAD _CLC_DEF double fast_length(double4 p) { return sqrt(dot(p,p));} diff --git a/src/builtins/lgamma_r.cl b/src/builtins/lgamma_r.cl new file mode 100644 index 0000000..aa3d487 --- /dev/null +++ b/src/builtins/lgamma_r.cl @@ -0,0 +1,80 @@ +/****************************************************************************** + * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "clc.h" + +/* Overloads of lgamma_r for float and double, as scalars and as 2-, 3-, 4-, + * 8- and 16-wide vectors, with ptr in the global, local and private address + * spaces. Every body comes from SCALAR_BODY / VECTOR_BODY (defined outside + * this file, presumably in clc.h) applied to the scalar helpers declared + * below, which are implemented elsewhere. + */ +_CLC_PROTECTED float lgammaf_r(float x, int * ptr); +_CLC_PROTECTED double builtin_lgamma_r(double x, int * ptr); + +_CLC_OVERLOAD _CLC_DEF float lgamma_r(float x, global int * ptr) SCALAR_BODY(float, lgammaf_r, int) +_CLC_OVERLOAD _CLC_DEF float lgamma_r(float x, local int * ptr) SCALAR_BODY(float, lgammaf_r, int) +_CLC_OVERLOAD _CLC_DEF float lgamma_r(float x, private int * ptr) SCALAR_BODY(float, lgammaf_r, int) + +_CLC_OVERLOAD _CLC_DEF float2 lgamma_r(float2 x, global int2 * ptr) VECTOR_BODY(float, 2, lgammaf_r, int) +_CLC_OVERLOAD _CLC_DEF float2 lgamma_r(float2 x, local int2 * ptr) VECTOR_BODY(float, 2, lgammaf_r, int) +_CLC_OVERLOAD _CLC_DEF float2 lgamma_r(float2 x, private int2 * ptr) VECTOR_BODY(float, 2, lgammaf_r, int) + +_CLC_OVERLOAD _CLC_DEF float3 lgamma_r(float3 x, global int3 * ptr) VECTOR_BODY(float, 3, lgammaf_r, int) +_CLC_OVERLOAD _CLC_DEF float3 lgamma_r(float3 x, local int3 * ptr) VECTOR_BODY(float, 3, lgammaf_r, int) +_CLC_OVERLOAD _CLC_DEF float3 lgamma_r(float3 x, private int3 * ptr) VECTOR_BODY(float, 3, lgammaf_r, int) + +_CLC_OVERLOAD _CLC_DEF float4 lgamma_r(float4 x, global int4 * ptr) VECTOR_BODY(float, 4, lgammaf_r, int) +_CLC_OVERLOAD _CLC_DEF float4 lgamma_r(float4 x, local int4 * ptr) VECTOR_BODY(float, 4, lgammaf_r, int) +_CLC_OVERLOAD _CLC_DEF float4 lgamma_r(float4 x, private int4 * ptr) VECTOR_BODY(float, 4, lgammaf_r, int) + +_CLC_OVERLOAD _CLC_DEF float8 lgamma_r(float8 x, global int8 * ptr) VECTOR_BODY(float, 8, lgammaf_r, int) +_CLC_OVERLOAD _CLC_DEF float8 lgamma_r(float8 x, local int8 * ptr) VECTOR_BODY(float, 8, lgammaf_r, int) +_CLC_OVERLOAD _CLC_DEF float8 lgamma_r(float8 x, private int8 * ptr) VECTOR_BODY(float, 8, lgammaf_r, int) + +_CLC_OVERLOAD _CLC_DEF float16 lgamma_r(float16 x, global int16 * ptr) VECTOR_BODY(float, 16, lgammaf_r, int) +_CLC_OVERLOAD _CLC_DEF float16 lgamma_r(float16 x, local int16 * ptr) 
VECTOR_BODY(float, 16, lgammaf_r, int) +_CLC_OVERLOAD _CLC_DEF float16 lgamma_r(float16 x, private int16 * ptr) VECTOR_BODY(float, 16, lgammaf_r, int) + +_CLC_OVERLOAD _CLC_DEF double lgamma_r(double x, global int * ptr) SCALAR_BODY(double, builtin_lgamma_r, int) +_CLC_OVERLOAD _CLC_DEF double lgamma_r(double x, local int * ptr) SCALAR_BODY(double, builtin_lgamma_r, int) +_CLC_OVERLOAD _CLC_DEF double lgamma_r(double x, private int * ptr) SCALAR_BODY(double, builtin_lgamma_r, int) + +_CLC_OVERLOAD _CLC_DEF double2 lgamma_r(double2 x, global int2 * ptr) VECTOR_BODY(double, 2, builtin_lgamma_r, int) +_CLC_OVERLOAD _CLC_DEF double2 lgamma_r(double2 x, local int2 * ptr) VECTOR_BODY(double, 2, builtin_lgamma_r, int) +_CLC_OVERLOAD _CLC_DEF double2 lgamma_r(double2 x, private int2 * ptr) VECTOR_BODY(double, 2, builtin_lgamma_r, int) + +_CLC_OVERLOAD _CLC_DEF double3 lgamma_r(double3 x, global int3 * ptr) VECTOR_BODY(double, 3, builtin_lgamma_r, int) +_CLC_OVERLOAD _CLC_DEF double3 lgamma_r(double3 x, local int3 * ptr) VECTOR_BODY(double, 3, builtin_lgamma_r, int) +_CLC_OVERLOAD _CLC_DEF double3 lgamma_r(double3 x, private int3 * ptr) VECTOR_BODY(double, 3, builtin_lgamma_r, int) + +_CLC_OVERLOAD _CLC_DEF double4 lgamma_r(double4 x, global int4 * ptr) VECTOR_BODY(double, 4, builtin_lgamma_r, int) +_CLC_OVERLOAD _CLC_DEF double4 lgamma_r(double4 x, local int4 * ptr) VECTOR_BODY(double, 4, builtin_lgamma_r, int) +_CLC_OVERLOAD _CLC_DEF double4 lgamma_r(double4 x, private int4 * ptr) VECTOR_BODY(double, 4, builtin_lgamma_r, int) + +_CLC_OVERLOAD _CLC_DEF double8 lgamma_r(double8 x, global int8 * ptr) VECTOR_BODY(double, 8, builtin_lgamma_r, int) +_CLC_OVERLOAD _CLC_DEF double8 lgamma_r(double8 x, local int8 * ptr) VECTOR_BODY(double, 8, builtin_lgamma_r, int) +_CLC_OVERLOAD _CLC_DEF double8 lgamma_r(double8 x, private int8 * ptr) VECTOR_BODY(double, 8, builtin_lgamma_r, int) + +_CLC_OVERLOAD _CLC_DEF double16 lgamma_r(double16 x, global int16 * ptr) VECTOR_BODY(double, 16, 
builtin_lgamma_r, int) +_CLC_OVERLOAD _CLC_DEF double16 lgamma_r(double16 x, local int16 * ptr) VECTOR_BODY(double, 16, builtin_lgamma_r, int) +_CLC_OVERLOAD _CLC_DEF double16 lgamma_r(double16 x, private int16 * ptr) VECTOR_BODY(double, 16, builtin_lgamma_r, int) + diff --git a/src/builtins/mad_sat.cl b/src/builtins/mad_sat.cl new file mode 100644 index 0000000..ac79a86 --- /dev/null +++ b/src/builtins/mad_sat.cl @@ -0,0 +1,37 @@ +/****************************************************************************** + * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "cpu.h" + +/* Instantiates mad_sat for all eight integer element types through + * TERNARY_VEC_DEF (defined outside this file, presumably in cpu.h). The + * mad_sat named as the operation is not defined in this file. + */ +TERNARY_VEC_DEF(char, char, mad_sat, mad_sat) +TERNARY_VEC_DEF(uchar, uchar, mad_sat, mad_sat) +TERNARY_VEC_DEF(short, short, mad_sat, mad_sat) +TERNARY_VEC_DEF(ushort, ushort,mad_sat, mad_sat) +TERNARY_VEC_DEF(int, int, mad_sat, mad_sat) +TERNARY_VEC_DEF(uint, uint, mad_sat, mad_sat) +TERNARY_VEC_DEF(long, long, mad_sat, mad_sat) +TERNARY_VEC_DEF(ulong, ulong, mad_sat, mad_sat) diff --git a/src/builtins/math.cl b/src/builtins/math.cl new file mode 100644 index 0000000..02db08b --- /dev/null +++ b/src/builtins/math.cl @@ -0,0 +1,151 @@ +/****************************************************************************** + * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "clc.h" + +/* Helper macros that instantiate one math builtin for float and double. + * UNARY / BINARY / TERNARY also define inline scalar wrappers function##f and + * function##d around __builtin_<function>; the *_NO_BUILTIN variants only + * invoke the *_VEC_DEF macros (defined outside this file, presumably in + * clc.h). UNARY_ALT is UNARY with a caller-chosen result type (utype). + * Every definition ends with a trailing backslash, so the blank line that + * follows it is spliced into the macro: keep that blank line in place. + */ +#define UNARY(function) \ +_CLC_PROTECTED _CLC_INLINE float function##f(float x) { return (float)__builtin_##function(x); } \ +_CLC_PROTECTED _CLC_INLINE double function##d(double x) { return __builtin_##function(x); } \ +UNARY_VEC_DEF(float, float, function, function##f) \ +UNARY_VEC_DEF(double, double, function, function##d) \ + +#define UNARY_ALT(utype, function) \ +_CLC_PROTECTED _CLC_INLINE utype function##f(float x) { return __builtin_##function(x); } \ +_CLC_PROTECTED _CLC_INLINE utype function##d(double x) { return __builtin_##function(x); } \ +UNARY_VEC_DEF(float, utype, function, function##f) \ +UNARY_VEC_DEF(double, utype, function, function##d) \ + +#define UNARY_NO_BUILTIN(function) \ +UNARY_VEC_DEF(float, float, function, function) \ +UNARY_VEC_DEF(double, double, function, function) \ + +/* NOTE(review): BINARY and TERNARY define function##f / function##d but pass + * plain `function` as the operation to the *_VEC_DEF macros, whereas UNARY + * passes function##f / function##d - confirm this difference is intentional. + */ +#define BINARY(function) \ +_CLC_PROTECTED _CLC_INLINE float function##f(float x, float y) { return (float)__builtin_##function(x,y); } \ +_CLC_PROTECTED _CLC_INLINE double function##d(double x, double y) { return __builtin_##function(x,y); } \ +BINARY_VEC_DEF(float, float, function, function) \ +BINARY_VEC_DEF(double, double, function, function) \ + +#define BINARY_NO_BUILTIN(function) \ +BINARY_VEC_DEF(float, float, function, function) \ +BINARY_VEC_DEF(double, double, function, function) \ + +#define TERNARY(function) \ +_CLC_PROTECTED _CLC_INLINE float function##f(float x, float y, float z) { return (float)__builtin_##function(x,y,z); } \ +_CLC_PROTECTED _CLC_INLINE double function##d(double x, double y, double z) { return __builtin_##function(x,y,z); } \ +TERNARY_VEC_DEF(float, float, function, function) \ +TERNARY_VEC_DEF(double, double, function, function) \ + +#define TERNARY_NO_BUILTIN(function) \ +TERNARY_VEC_DEF(float, float, function, function) \ +TERNARY_VEC_DEF(double, double, function, function) \ + 
+/*------------------------------------------------------------------------- +* Instantiations of the math builtins through the macros defined above +*------------------------------------------------------------------------*/ +UNARY(acos) +UNARY(acosh) +UNARY_NO_BUILTIN(acospi) +UNARY(asin) +UNARY(asinh) +UNARY_NO_BUILTIN(asinpi) +UNARY(atan) +BINARY_NO_BUILTIN(atan2pi) +UNARY(atanh) +UNARY_NO_BUILTIN(atanpi) +BINARY(atan2) +UNARY(cbrt) +UNARY(ceil) +UNARY(cos) +BINARY(copysign) +UNARY(cosh) +UNARY_NO_BUILTIN(cospi) +UNARY(erf) +UNARY(erfc) +UNARY(exp) +UNARY(exp2) +UNARY_NO_BUILTIN(exp10) +UNARY(expm1) +UNARY(fabs) +BINARY(fdim) +UNARY(floor) +TERNARY(fma) +BINARY(fmax) +BINARY(fmin) +BINARY(fmod) +BINARY(hypot) + +UNARY_ALT(int, ilogb) + +BINARY_VEC_DEF_ALT(float, float, int, ldexp, ldexpf) +BINARY_VEC_DEF_ALT(double, double, int, ldexp, ldexp) + +UNARY(lgamma) +UNARY(log) +UNARY(log2) +UNARY(log10) +UNARY(log1p) +UNARY(logb) +TERNARY_NO_BUILTIN(mad) +BINARY_NO_BUILTIN(maxmag) +BINARY_NO_BUILTIN(minmag) + +/* nan is instantiated with an unsigned integer argument type and a floating-point second type of the same width */ +UNARY_VEC_DEF(uint, float, nan, nan) +UNARY_VEC_DEF(ulong, double, nan, nan) + +BINARY(nextafter) +BINARY(pow) + +BINARY_VEC_DEF_ALT(float, float, int, pown, powf) +BINARY_VEC_DEF_ALT(double, double, int, pown, builtin_pow) + +BINARY_NO_BUILTIN(powr) +BINARY(remainder) +UNARY(rint) + +BINARY_VEC_DEF_ALT(float, float, int, rootn, builtin_rootnf) +BINARY_VEC_DEF_ALT(double, double, int, rootn, builtin_rootn) + +UNARY(round) +UNARY_NO_BUILTIN(rsqrt) +UNARY(sin) +UNARY(sinh) +UNARY_NO_BUILTIN(sinpi) +UNARY(sqrt) +UNARY(tan) +UNARY(tanh) +UNARY_NO_BUILTIN(tanpi) +UNARY(tgamma) +UNARY(trunc) + +/*------------------------------------------------------------------------- +* Half functions: +*------------------------------------------------------------------------*/ + +BINARY_NO_BUILTIN(half_divide) +UNARY_NO_BUILTIN(half_recip) + + diff --git a/src/builtins/max.cl b/src/builtins/max.cl new file mode 100644 index 0000000..9605490 --- /dev/null +++ b/src/builtins/max.cl @@ -0,0 +1,46 @@ 
+/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "clc.h" + +/* Vector min and max. EXPAND_SIZES emits IMPLEMENTATION for the 3-, 4-, 8- + * and 16-wide vectors of one element type; each IMPLEMENTATION defines min + * and max in a vector/vector form and in a vector/scalar form whose scalar + * operand is first cast to the vector type. _EXPAND_TYPES() is defined + * outside this file (presumably in clc.h) and presumably invokes + * EXPAND_SIZES once per element type - TODO confirm. + * NOTE(review): 2-wide vectors and scalars are not instantiated here; + * confirm they are provided elsewhere. + */ +#define EXPAND_SIZES(type) \ + IMPLEMENTATION(_VEC_TYPE(type,3), type) \ + IMPLEMENTATION(_VEC_TYPE(type,4), type) \ + IMPLEMENTATION(_VEC_TYPE(type,8), type) \ + IMPLEMENTATION(_VEC_TYPE(type,16), type) \ + +#define IMPLEMENTATION(gentype, sgentype) \ +_CLC_OVERLOAD _CLC_DEF gentype min(gentype x, gentype y) \ + { return y < x ? y : x; } \ +_CLC_OVERLOAD _CLC_DEF gentype min(gentype x, sgentype y) \ + { return (gentype)y < x ? (gentype)y : x; } \ +_CLC_OVERLOAD _CLC_DEF gentype max(gentype x, gentype y) \ + { return y > x ? y : x; } \ +_CLC_OVERLOAD _CLC_DEF gentype max(gentype x, sgentype y) \ + { return (gentype)y > x ? (gentype)y : x; } \ + +_EXPAND_TYPES() diff --git a/src/builtins/misc.cl b/src/builtins/misc.cl new file mode 100644 index 0000000..aba5efa --- /dev/null +++ b/src/builtins/misc.cl @@ -0,0 +1,36 @@ +/****************************************************************************** + * Copyright (c) 2011-2014, Peter Collingbourne <peter@pcc.me.uk> + * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "cpu.h" + +/* Memory-fence builtins. mem_fence, read_mem_fence and write_mem_fence all + * ignore their flags argument and call __mfence(), which is only declared + * here and is implemented elsewhere. barrier() is left commented out in + * this file. + */ +_CLC_PROTECTED void __mfence(void); +//_CLC_PROTECTED void barrier(cl_mem_fence_flags flags) { } +_CLC_PROTECTED void mem_fence(cl_mem_fence_flags flags) { __mfence(); } +_CLC_PROTECTED void read_mem_fence(cl_mem_fence_flags flags) { __mfence(); } +_CLC_PROTECTED void write_mem_fence(cl_mem_fence_flags flags) { __mfence(); } + diff --git a/src/builtins/mix.cl b/src/builtins/mix.cl new file mode 100644 index 0000000..9f339aa --- /dev/null +++ b/src/builtins/mix.cl @@ -0,0 +1,42 @@ +/****************************************************************************** + * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "clc.h" + +/* Vector mix, computed as x + (y-x) * a. EXPAND_SIZES emits IMPLEMENTATION + * for the 3-, 4-, 8- and 16-wide vectors of one element type; each + * IMPLEMENTATION defines mix with a vector a and with a scalar a that is + * first cast to the vector type. _EXPAND_TYPES() is defined outside this + * file (presumably in clc.h) and presumably invokes EXPAND_SIZES once per + * element type - TODO confirm which element types it covers. + */ +#define EXPAND_SIZES(type) \ + IMPLEMENTATION (_VEC_TYPE(type,3), type) \ + IMPLEMENTATION (_VEC_TYPE(type,4), type) \ + IMPLEMENTATION (_VEC_TYPE(type,8), type) \ + IMPLEMENTATION (_VEC_TYPE(type,16), type) \ + +#define IMPLEMENTATION(gentype, sgentype) \ +_CLC_OVERLOAD _CLC_DEF gentype mix(gentype x, gentype y, gentype a) \ + { return x + (y-x) * a; } \ +_CLC_OVERLOAD _CLC_DEF gentype mix(gentype x, gentype y, sgentype a) \ + { return x + (y-x) * (gentype)a; } \ + +_EXPAND_TYPES() diff --git a/src/builtins/modf.cl b/src/builtins/modf.cl new file mode 100644 index 0000000..cf0aae7 --- /dev/null +++ b/src/builtins/modf.cl @@ -0,0 +1,81 @@ +/****************************************************************************** + * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "clc.h" + +/* Overloads of modf for float and double, as scalars and as 2-, 3-, 4-, 8- + * and 16-wide vectors, with ptr in the global, local and private address + * spaces; ptr has the same element type and width as x. Every body comes + * from SCALAR_BODY / VECTOR_BODY (defined outside this file, presumably in + * clc.h) applied to the scalar helpers declared below, which are implemented + * elsewhere. + */ +_CLC_PROTECTED float modff(float x, float * iptr); +_CLC_PROTECTED double builtin_modf(double x, double * iptr); + + +_CLC_OVERLOAD _CLC_DEF float modf(float x, global float * ptr) SCALAR_BODY(float, modff, float) +_CLC_OVERLOAD _CLC_DEF float modf(float x, local float * ptr) SCALAR_BODY(float, modff, float) +_CLC_OVERLOAD _CLC_DEF float modf(float x, private float * ptr) SCALAR_BODY(float, modff, float) + +_CLC_OVERLOAD _CLC_DEF float2 modf(float2 x, global float2 * ptr) VECTOR_BODY(float, 2, modff, float) +_CLC_OVERLOAD _CLC_DEF float2 modf(float2 x, local float2 * ptr) VECTOR_BODY(float, 2, modff, float) +_CLC_OVERLOAD _CLC_DEF float2 modf(float2 x, private float2 * ptr) VECTOR_BODY(float, 2, modff, float) + +_CLC_OVERLOAD _CLC_DEF float3 modf(float3 x, global float3 * ptr) VECTOR_BODY(float, 3, modff, float) +_CLC_OVERLOAD _CLC_DEF float3 modf(float3 x, local float3 * ptr) VECTOR_BODY(float, 3, modff, float) +_CLC_OVERLOAD _CLC_DEF float3 modf(float3 x, private float3 * ptr) VECTOR_BODY(float, 3, modff, float) + +_CLC_OVERLOAD _CLC_DEF float4 modf(float4 x, global float4 * ptr) VECTOR_BODY(float, 4, modff, float) +_CLC_OVERLOAD _CLC_DEF float4 modf(float4 x, local float4 * ptr) VECTOR_BODY(float, 4, modff, float) +_CLC_OVERLOAD _CLC_DEF float4 modf(float4 x, private float4 * ptr) VECTOR_BODY(float, 4, modff, float) + 
+_CLC_OVERLOAD _CLC_DEF float8 modf(float8 x, global float8 * ptr) VECTOR_BODY(float, 8, modff, float) +_CLC_OVERLOAD _CLC_DEF float8 modf(float8 x, local float8 * ptr) VECTOR_BODY(float, 8, modff, float) +_CLC_OVERLOAD _CLC_DEF float8 modf(float8 x, private float8 * ptr) VECTOR_BODY(float, 8, modff, float) + +_CLC_OVERLOAD _CLC_DEF float16 modf(float16 x, global float16 * ptr) VECTOR_BODY(float, 16, modff, float) +_CLC_OVERLOAD _CLC_DEF float16 modf(float16 x, local float16 * ptr) VECTOR_BODY(float, 16, modff, float) +_CLC_OVERLOAD _CLC_DEF float16 modf(float16 x, private float16 * ptr) VECTOR_BODY(float, 16, modff, float) + +_CLC_OVERLOAD _CLC_DEF double modf(double x, global double * ptr) SCALAR_BODY(double, builtin_modf, double) +_CLC_OVERLOAD _CLC_DEF double modf(double x, local double * ptr) SCALAR_BODY(double, builtin_modf, double) +_CLC_OVERLOAD _CLC_DEF double modf(double x, private double * ptr) SCALAR_BODY(double, builtin_modf, double) + +_CLC_OVERLOAD _CLC_DEF double2 modf(double2 x, global double2 * ptr) VECTOR_BODY(double, 2, builtin_modf, double) +_CLC_OVERLOAD _CLC_DEF double2 modf(double2 x, local double2 * ptr) VECTOR_BODY(double, 2, builtin_modf, double) +_CLC_OVERLOAD _CLC_DEF double2 modf(double2 x, private double2 * ptr) VECTOR_BODY(double, 2, builtin_modf, double) + +_CLC_OVERLOAD _CLC_DEF double3 modf(double3 x, global double3 * ptr) VECTOR_BODY(double, 3, builtin_modf, double) +_CLC_OVERLOAD _CLC_DEF double3 modf(double3 x, local double3 * ptr) VECTOR_BODY(double, 3, builtin_modf, double) +_CLC_OVERLOAD _CLC_DEF double3 modf(double3 x, private double3 * ptr) VECTOR_BODY(double, 3, builtin_modf, double) + +_CLC_OVERLOAD _CLC_DEF double4 modf(double4 x, global double4 * ptr) VECTOR_BODY(double, 4, builtin_modf, double) +_CLC_OVERLOAD _CLC_DEF double4 modf(double4 x, local double4 * ptr) VECTOR_BODY(double, 4, builtin_modf, double) +_CLC_OVERLOAD _CLC_DEF double4 modf(double4 x, private double4 * ptr) VECTOR_BODY(double, 4, builtin_modf, 
double) + +_CLC_OVERLOAD _CLC_DEF double8 modf(double8 x, global double8 * ptr) VECTOR_BODY(double, 8, builtin_modf, double) +_CLC_OVERLOAD _CLC_DEF double8 modf(double8 x, local double8 * ptr) VECTOR_BODY(double, 8, builtin_modf, double) +_CLC_OVERLOAD _CLC_DEF double8 modf(double8 x, private double8 * ptr) VECTOR_BODY(double, 8, builtin_modf, double) + +_CLC_OVERLOAD _CLC_DEF double16 modf(double16 x, global double16 * ptr) VECTOR_BODY(double, 16, builtin_modf, double) +_CLC_OVERLOAD _CLC_DEF double16 modf(double16 x, local double16 * ptr) VECTOR_BODY(double, 16, builtin_modf, double) +_CLC_OVERLOAD _CLC_DEF double16 modf(double16 x, private double16 * ptr) VECTOR_BODY(double, 16, builtin_modf, double) + diff --git a/src/builtins/mul_hi.cl b/src/builtins/mul_hi.cl new file mode 100644 index 0000000..5b3368e --- /dev/null +++ b/src/builtins/mul_hi.cl @@ -0,0 +1,102 @@ +/****************************************************************************** + * Copyright (c) 2011-2014, Peter Collingbourne <peter@pcc.me.uk> + * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "cpu.h" + +//FOIL-based long mul_hi +// +// Summary: Treat mul_hi(long x, long y) as: +// (a+b) * (c+d) where a and c are the high-order parts of x and y respectively +// and b and d are the low-order parts of x and y. +// Thinking back to algebra, we use FOIL to do the work. +// +// Returns the upper 64 bits of the 128-bit product of x and y. + +_CLC_OVERLOAD _CLC_DEF long mul_hi(long x, long y){ + long f, o, i; + ulong l; + + //Move the high/low halves of x/y into the lower 32-bits of variables so + //that we can multiply them without worrying about overflow. + long x_hi = x >> 32; + long x_lo = x & UINT_MAX; + long y_hi = y >> 32; + long y_lo = y & UINT_MAX; + + //Multiply all of the components according to FOIL method + f = x_hi * y_hi; + o = x_hi * y_lo; + i = x_lo * y_hi; + //x_lo and y_lo can each be as large as 2^32-1, so their product can exceed + //LONG_MAX; multiply as ulong so the low*low term never overflows a signed type. + l = (ulong)x_lo * (ulong)y_lo; + + //Now add the components back together in the following steps: + //F: doesn't need to be modified + //O/I: Need to be added together. + //L: Shift right by 32-bits, then add into the sum of O and I + //Once O/I/L are summed up, then shift the sum by 32-bits and add to F. 
+ // + //We use hadd to give us a bit of extra precision for the intermediate sums + //but as a result, we shift by 31 bits instead of 32 + return (long)(f + (hadd(o, (i + (long)((ulong)l>>32))) >> 31)); +} + +//Unsigned 64-bit mul_hi: same FOIL decomposition as the signed overload above, +//carried out entirely in ulong. Returns the upper 64 bits of the 128-bit product. +_CLC_OVERLOAD _CLC_DEF ulong mul_hi(ulong x, ulong y) +{ + ulong f, o, i; + ulong l; + + //Move the high/low halves of x/y into the lower 32-bits of variables so + //that we can multiply them without worrying about overflow. + ulong x_hi = x >> 32; + ulong x_lo = x & UINT_MAX; + ulong y_hi = y >> 32; + ulong y_lo = y & UINT_MAX; + + //Multiply all of the components according to FOIL method + f = x_hi * y_hi; + o = x_hi * y_lo; + i = x_lo * y_hi; + l = x_lo * y_lo; + + //Now add the components back together, taking care to respect the fact that: + //F: doesn't need to be modified + //O/I: Need to be added together. + //L: Shift right by 32-bits, then add into the sum of O and I + //Once O/I/L are summed up, then shift the sum by 32-bits and add to F. + // + //We use hadd to give us a bit of extra precision for the intermediate sums + //but as a result, we shift by 31 bits instead of 32 + return (f + (hadd(o, (i + (l>>32))) >> 31)); +} + +//Vector overloads for every integer element type via BINARY_VEC_DEF (defined +//outside this file). Only the long/ulong scalar mul_hi are defined here; the +//narrower scalar overloads are presumably provided elsewhere - TODO confirm. +BINARY_VEC_DEF(char, char, mul_hi, mul_hi) +BINARY_VEC_DEF(uchar, uchar, mul_hi, mul_hi) +BINARY_VEC_DEF(short, short, mul_hi, mul_hi) +BINARY_VEC_DEF(ushort, ushort,mul_hi, mul_hi) +BINARY_VEC_DEF(int, int, mul_hi, mul_hi) +BINARY_VEC_DEF(uint, uint, mul_hi, mul_hi) +BINARY_VEC_DEF(long, long, mul_hi, mul_hi) +BINARY_VEC_DEF(ulong, ulong, mul_hi, mul_hi) diff --git a/src/builtins/relationals.cl b/src/builtins/relationals.cl new file mode 100644 index 0000000..a1d6830 --- /dev/null +++ b/src/builtins/relationals.cl @@ -0,0 +1,64 @@ +/****************************************************************************** + * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "clc.h" + +/* Vector overloads of the relational builtins, generated by UNARY_VEC_DEF / + * BINARY_VEC_DEF (defined outside this file, presumably in clc.h). The float + * rows are paired with int and the double rows with long. The operation is + * passed with a leading minus (e.g. -isnan), presumably so that a scalar + * result of 1 becomes -1 in each vector lane - TODO confirm against the + * macro definitions. + */ +UNARY_VEC_DEF(float, int, isnan, -isnan) +UNARY_VEC_DEF(double, long, isnan, -isnan) + +UNARY_VEC_DEF(float, int, isfinite, -isfinite) +UNARY_VEC_DEF(double, long, isfinite, -isfinite) + +UNARY_VEC_DEF(float, int, isinf, -isinf) +UNARY_VEC_DEF(double, long, isinf, -isinf) + +UNARY_VEC_DEF(float, int, isnormal, -isnormal) +UNARY_VEC_DEF(double, long, isnormal, -isnormal) + +UNARY_VEC_DEF(float, int, signbit, -signbit) +UNARY_VEC_DEF(double, long, signbit, -signbit) + +BINARY_VEC_DEF(float, int, isequal, -isequal) +BINARY_VEC_DEF(double, long, isequal, -isequal) + +BINARY_VEC_DEF(float, int, isnotequal, -isnotequal) +BINARY_VEC_DEF(double, long, isnotequal, -isnotequal) + +BINARY_VEC_DEF(float, int, isless, -isless) +BINARY_VEC_DEF(double, long, isless, -isless) + +BINARY_VEC_DEF(float, int, islessequal, -islessequal) +BINARY_VEC_DEF(double, long, islessequal, -islessequal) + +BINARY_VEC_DEF(float, int, isgreater, -isgreater) +BINARY_VEC_DEF(double, long, isgreater, -isgreater) + +BINARY_VEC_DEF(float, int, isgreaterequal, -isgreaterequal) +BINARY_VEC_DEF(double, long, isgreaterequal, -isgreaterequal) + +BINARY_VEC_DEF(float, int, islessgreater, -islessgreater) +BINARY_VEC_DEF(double, long, islessgreater, -islessgreater) diff --git a/src/builtins/remquo.cl b/src/builtins/remquo.cl new file mode 100644 index 0000000..1bc5094 --- /dev/null +++ b/src/builtins/remquo.cl @@ -0,0 +1,127 @@ +/****************************************************************************** + * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "clc.h" +/* REMQUO_SCALAR_BODY is the function body shared by the scalar remquo overloads: op writes the quotient into a private temporary, which is then copied through ptr (declared global, local or private by the overload using the macro), and the value returned by op is returned. */ +#define REMQUO_SCALAR_BODY(type, op, ptr_type) \ +{ \ + ptr_type temp; \ + type result = op(x, y, &temp); \ + *ptr = temp; \ + return result; \ +} \ +/* REMQUO_VECTOR_BODY_<n> expands to one scalar op call per element: the result goes to the matching component of temp and the quotient into the same slot of itemp, addressed as an array of ptr_type. Each size starts by reusing the next smaller expansion. */ +#define REMQUO_VECTOR_BODY_2(op, ptr_type) \ + temp.s0 = op(x.s0 ,y.s0, &(((ptr_type*)&itemp)[0])); \ + temp.s1 = op(x.s1 ,y.s1, &(((ptr_type*)&itemp)[1])); \ + +#define REMQUO_VECTOR_BODY_3(op, ptr_type) \ + REMQUO_VECTOR_BODY_2(op, ptr_type) \ + temp.s2 = op(x.s2 ,y.s2, &(((ptr_type*)&itemp)[2])); \ + +#define REMQUO_VECTOR_BODY_4(op, ptr_type) \ + REMQUO_VECTOR_BODY_3(op, ptr_type) \ + temp.s3 = op(x.s3 ,y.s3, &(((ptr_type*)&itemp)[3])); \ + +#define REMQUO_VECTOR_BODY_8(op, ptr_type) \ + REMQUO_VECTOR_BODY_4(op, ptr_type) \ + temp.s4 = op(x.s4 ,y.s4, &(((ptr_type*)&itemp)[4])); \ + temp.s5 = op(x.s5 ,y.s5, &(((ptr_type*)&itemp)[5])); \ + temp.s6 = op(x.s6 ,y.s6, &(((ptr_type*)&itemp)[6])); \ + temp.s7 = op(x.s7 ,y.s7, &(((ptr_type*)&itemp)[7])); \ + +#define REMQUO_VECTOR_BODY_16(op, ptr_type) \ + REMQUO_VECTOR_BODY_8(op, ptr_type) \ + temp.s8 = op(x.s8 ,y.s8, &(((ptr_type*)&itemp)[8])); \ + temp.s9 = op(x.s9 ,y.s9, &(((ptr_type*)&itemp)[9])); \ + temp.sa = op(x.sa ,y.sa, &(((ptr_type*)&itemp)[10])); \ + temp.sb = op(x.sb ,y.sb, &(((ptr_type*)&itemp)[11])); \ + temp.sc = op(x.sc ,y.sc, &(((ptr_type*)&itemp)[12])); \ + temp.sd = op(x.sd ,y.sd, &(((ptr_type*)&itemp)[13])); \ + temp.se = op(x.se ,y.se, &(((ptr_type*)&itemp)[14])); \ + temp.sf = op(x.sf ,y.sf, &(((ptr_type*)&itemp)[15])); \ + +#define REMQUO_VECTOR_BODY(prim_type, num, op, ptr_type) \ +{ \ + prim_type##num temp; \ + ptr_type##num itemp; \ + REMQUO_VECTOR_BODY_##num(op, ptr_type)\ + *ptr = itemp; \ + return temp; \ +} \ + +/* Scalar workers called by every overload below; this file only declares them. */ +_CLC_PROTECTED float remquof(float x, float y, int * ptr); +_CLC_PROTECTED double builtin_remquo(double x, double y, int * ptr); + +_CLC_OVERLOAD _CLC_DEF float remquo(float x, float y, global int * ptr) REMQUO_SCALAR_BODY(float, 
remquof, int) +_CLC_OVERLOAD _CLC_DEF float remquo(float x, float y, local int * ptr) REMQUO_SCALAR_BODY(float, remquof, int) +_CLC_OVERLOAD _CLC_DEF float remquo(float x, float y, private int * ptr) REMQUO_SCALAR_BODY(float, remquof, int) + +_CLC_OVERLOAD _CLC_DEF float2 remquo(float2 x, float2 y, global int2 * ptr) REMQUO_VECTOR_BODY(float, 2, remquof, int) +_CLC_OVERLOAD _CLC_DEF float2 remquo(float2 x, float2 y, local int2 * ptr) REMQUO_VECTOR_BODY(float, 2, remquof, int) +_CLC_OVERLOAD _CLC_DEF float2 remquo(float2 x, float2 y, private int2 * ptr) REMQUO_VECTOR_BODY(float, 2, remquof, int) + +_CLC_OVERLOAD _CLC_DEF float3 remquo(float3 x, float3 y, global int3 * ptr) REMQUO_VECTOR_BODY(float, 3, remquof, int) +_CLC_OVERLOAD _CLC_DEF float3 remquo(float3 x, float3 y, local int3 * ptr) REMQUO_VECTOR_BODY(float, 3, remquof, int) +_CLC_OVERLOAD _CLC_DEF float3 remquo(float3 x, float3 y, private int3 * ptr) REMQUO_VECTOR_BODY(float, 3, remquof, int) + +_CLC_OVERLOAD _CLC_DEF float4 remquo(float4 x, float4 y, global int4 * ptr) REMQUO_VECTOR_BODY(float, 4, remquof, int) +_CLC_OVERLOAD _CLC_DEF float4 remquo(float4 x, float4 y, local int4 * ptr) REMQUO_VECTOR_BODY(float, 4, remquof, int) +_CLC_OVERLOAD _CLC_DEF float4 remquo(float4 x, float4 y, private int4 * ptr) REMQUO_VECTOR_BODY(float, 4, remquof, int) + +_CLC_OVERLOAD _CLC_DEF float8 remquo(float8 x, float8 y, global int8 * ptr) REMQUO_VECTOR_BODY(float, 8, remquof, int) +_CLC_OVERLOAD _CLC_DEF float8 remquo(float8 x, float8 y, local int8 * ptr) REMQUO_VECTOR_BODY(float, 8, remquof, int) +_CLC_OVERLOAD _CLC_DEF float8 remquo(float8 x, float8 y, private int8 * ptr) REMQUO_VECTOR_BODY(float, 8, remquof, int) + +_CLC_OVERLOAD _CLC_DEF float16 remquo(float16 x, float16 y, global int16 * ptr) REMQUO_VECTOR_BODY(float, 16, remquof, int) +_CLC_OVERLOAD _CLC_DEF float16 remquo(float16 x, float16 y, local int16 * ptr) REMQUO_VECTOR_BODY(float, 16, remquof, int) +_CLC_OVERLOAD _CLC_DEF float16 remquo(float16 x, float16 y, 
private int16 * ptr) REMQUO_VECTOR_BODY(float, 16, remquof, int) + +_CLC_OVERLOAD _CLC_DEF double remquo(double x, double y, global int * ptr) REMQUO_SCALAR_BODY(double, builtin_remquo, int) +_CLC_OVERLOAD _CLC_DEF double remquo(double x, double y, local int * ptr) REMQUO_SCALAR_BODY(double, builtin_remquo, int) +_CLC_OVERLOAD _CLC_DEF double remquo(double x, double y, private int * ptr) REMQUO_SCALAR_BODY(double, builtin_remquo, int) + +_CLC_OVERLOAD _CLC_DEF double2 remquo(double2 x, double2 y, global int2 * ptr) REMQUO_VECTOR_BODY(double, 2, builtin_remquo, int) +_CLC_OVERLOAD _CLC_DEF double2 remquo(double2 x, double2 y, local int2 * ptr) REMQUO_VECTOR_BODY(double, 2, builtin_remquo, int) +_CLC_OVERLOAD _CLC_DEF double2 remquo(double2 x, double2 y, private int2 * ptr) REMQUO_VECTOR_BODY(double, 2, builtin_remquo, int) + +_CLC_OVERLOAD _CLC_DEF double3 remquo(double3 x, double3 y, global int3 * ptr) REMQUO_VECTOR_BODY(double, 3, builtin_remquo, int) +_CLC_OVERLOAD _CLC_DEF double3 remquo(double3 x, double3 y, local int3 * ptr) REMQUO_VECTOR_BODY(double, 3, builtin_remquo, int) +_CLC_OVERLOAD _CLC_DEF double3 remquo(double3 x, double3 y, private int3 * ptr) REMQUO_VECTOR_BODY(double, 3, builtin_remquo, int) + +_CLC_OVERLOAD _CLC_DEF double4 remquo(double4 x, double4 y, global int4 * ptr) REMQUO_VECTOR_BODY(double, 4, builtin_remquo, int) +_CLC_OVERLOAD _CLC_DEF double4 remquo(double4 x, double4 y, local int4 * ptr) REMQUO_VECTOR_BODY(double, 4, builtin_remquo, int) +_CLC_OVERLOAD _CLC_DEF double4 remquo(double4 x, double4 y, private int4 * ptr) REMQUO_VECTOR_BODY(double, 4, builtin_remquo, int) + +_CLC_OVERLOAD _CLC_DEF double8 remquo(double8 x, double8 y, global int8 * ptr) REMQUO_VECTOR_BODY(double, 8, builtin_remquo, int) +_CLC_OVERLOAD _CLC_DEF double8 remquo(double8 x, double8 y, local int8 * ptr) REMQUO_VECTOR_BODY(double, 8, builtin_remquo, int) +_CLC_OVERLOAD _CLC_DEF double8 remquo(double8 x, double8 y, private int8 * ptr) REMQUO_VECTOR_BODY(double, 8, 
builtin_remquo, int) + +_CLC_OVERLOAD _CLC_DEF double16 remquo(double16 x, double16 y, global int16 * ptr) REMQUO_VECTOR_BODY(double, 16, builtin_remquo, int) +_CLC_OVERLOAD _CLC_DEF double16 remquo(double16 x, double16 y, local int16 * ptr) REMQUO_VECTOR_BODY(double, 16, builtin_remquo, int) +_CLC_OVERLOAD _CLC_DEF double16 remquo(double16 x, double16 y, private int16 * ptr) REMQUO_VECTOR_BODY(double, 16, builtin_remquo, int) diff --git a/src/builtins/rotate.cl b/src/builtins/rotate.cl new file mode 100644 index 0000000..fc894b0 --- /dev/null +++ b/src/builtins/rotate.cl @@ -0,0 +1,58 @@ +/****************************************************************************** + * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "cpu.h" +/* SCALAR(type, utype) defines rotate(v, i), a left rotation of v by i bit positions. The count is reduced to the bit width with mask = bits - 1; a reduced count of 0 returns v unchanged, which also keeps the right shift below from shifting by the full width. The right shift is done on the unsigned counterpart utype so that zeros rather than sign bits are shifted in. */ +/*----------------------------------------------------------------------------- +* The template for non rotl applicable scalar types +*----------------------------------------------------------------------------*/ +#define SCALAR(type, utype) \ +_CLC_OVERLOAD _CLC_DEF type rotate(type v, type i) \ +{\ + uint bits = sizeof(v) << 3;\ + uint mask = bits - 1; \ + i &= mask; \ + if (i == 0) return v; \ + return (v << i) | ((utype)v >> (bits-i)); \ +}\ +/* NOTE(review): there is no SCALAR(uint, uint) in this list; scalar uint rotate is presumably supplied elsewhere (the header comment above mentions rotl) -- confirm, since the uint vector instantiation below names rotate as its scalar operation. */ +SCALAR(uchar, uchar) +SCALAR(char, uchar) +SCALAR(ushort, ushort) +SCALAR(short, ushort) +SCALAR(ulong, ulong) +SCALAR(long, ulong) +SCALAR(int, uint) +/* Vector overloads for every integer element type, generated by BINARY_VEC_DEF (defined outside this file) with rotate named as the operation. */ +BINARY_VEC_DEF(char, char, rotate, rotate) +BINARY_VEC_DEF(uchar, uchar, rotate, rotate) +BINARY_VEC_DEF(short, short, rotate, rotate) +BINARY_VEC_DEF(ushort, ushort,rotate, rotate) +BINARY_VEC_DEF(int, int, rotate, rotate) +BINARY_VEC_DEF(uint, uint, rotate, rotate) +BINARY_VEC_DEF(long, long, rotate, rotate) +BINARY_VEC_DEF(ulong, ulong, rotate, rotate) diff --git a/src/builtins/select.cl b/src/builtins/select.cl new file mode 100644 index 0000000..52a078c --- /dev/null +++ b/src/builtins/select.cl @@ -0,0 +1,53 @@ +/****************************************************************************** + * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "clc.h" +/* DECLARATION(type, itype, utype) emits two select overloads for one vector type: one taking a signed and one an unsigned integer condition vector. Both bodies are the vector form of "c ? b : a". NOTE(review): in OpenCL C that operator is expected to pick b for the elements of c whose most significant bit is set -- confirm for the compiler in use. */ +#define DECLARATION(type, itype, utype) \ +_CLC_OVERLOAD _CLC_DEF type select(type a, type b, itype c) { return c ? b : a; }\ +_CLC_OVERLOAD _CLC_DEF type select(type a, type b, utype c) { return c ? 
b : a; } +/* SELECT_EXPAND_SIZES instantiates DECLARATION for the 2-, 3-, 4-, 8- and 16-element vectors of one element type, using _VEC_TYPE (defined outside this file) to build the vector type names. */ +#define SELECT_EXPAND_SIZES(type,itype,utype) \ + DECLARATION(_VEC_TYPE(type,2), _VEC_TYPE(itype,2), _VEC_TYPE(utype,2)) \ + DECLARATION(_VEC_TYPE(type,3), _VEC_TYPE(itype,3), _VEC_TYPE(utype,3)) \ + DECLARATION(_VEC_TYPE(type,4), _VEC_TYPE(itype,4), _VEC_TYPE(utype,4)) \ + DECLARATION(_VEC_TYPE(type,8), _VEC_TYPE(itype,8), _VEC_TYPE(utype,8)) \ + DECLARATION(_VEC_TYPE(type,16), _VEC_TYPE(itype,16), _VEC_TYPE(utype,16)) \ +/* SELECT_EXPAND_TYPES lists every element type with the signed and unsigned condition types paired with it; float is paired with int/uint and double with long/ulong. */ +#define SELECT_EXPAND_TYPES \ + SELECT_EXPAND_SIZES(char, char, uchar) \ + SELECT_EXPAND_SIZES(uchar, char, uchar) \ + SELECT_EXPAND_SIZES(short, short, ushort) \ + SELECT_EXPAND_SIZES(ushort, short, ushort) \ + SELECT_EXPAND_SIZES(int, int, uint) \ + SELECT_EXPAND_SIZES(uint, int, uint) \ + SELECT_EXPAND_SIZES(long, long, ulong) \ + SELECT_EXPAND_SIZES(ulong, long, ulong) \ + SELECT_EXPAND_SIZES(float, int, uint) \ + SELECT_EXPAND_SIZES(double, long, ulong) + +SELECT_EXPAND_TYPES diff --git a/src/builtins/shuffle.cl b/src/builtins/shuffle.cl new file mode 100644 index 0000000..3ec3b56 --- /dev/null +++ b/src/builtins/shuffle.cl @@ -0,0 +1,215 @@ +/****************************************************************************** + * Copyright (c) 2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#include "clc.h" +/* TEMPLATE<n>(res_elemt, val_vnum, mask_elemt) defines shuffle and shuffle2 returning an n-element vector built from input vectors of val_vnum elements. The inputs are read through a scalar pointer; each mask element is ANDed with vec_step(val)-1 to form the index ('-' binds tighter than '&', so the subtraction happens first). In shuffle2 the mask bit equal to vec_step(val1) selects the source: val2 when set, val1 when clear. */ +#define TEMPLATE2(res_elemt, val_vnum, mask_elemt) \ +_CLC_OVERLOAD _CLC_DEF res_elemt##2 shuffle(res_elemt##val_vnum val, mask_elemt##2 mask) \ +{ \ + res_elemt##2 result; \ + res_elemt *p = (res_elemt*)&val; \ + result.s0 = p[mask.s0 & vec_step(val)-1]; \ + result.s1 = p[mask.s1 & vec_step(val)-1]; \ + return result; \ +}\ +_CLC_OVERLOAD _CLC_DEF res_elemt##2 shuffle2(res_elemt##val_vnum val1, res_elemt##val_vnum val2, mask_elemt##2 mask) \ +{ \ + res_elemt##2 result; \ + res_elemt *p1 = (res_elemt*)&val1; \ + res_elemt *p2 = (res_elemt*)&val2; \ + result.s0 = mask.s0 & vec_step(val1) ? p2[mask.s0 & vec_step(val1)-1] : \ + p1[mask.s0 & vec_step(val1)-1]; \ + result.s1 = mask.s1 & vec_step(val1) ? 
p2[mask.s1 & vec_step(val1)-1] : \ + p1[mask.s1 & vec_step(val1)-1]; \ + return result; \ +} + +#define TEMPLATE4(res_elemt, val_vnum, mask_elemt) \ +_CLC_OVERLOAD _CLC_DEF res_elemt##4 shuffle(res_elemt##val_vnum val, mask_elemt##4 mask) \ +{ \ + res_elemt##4 result; \ + res_elemt *p = (res_elemt*)&val; \ + result.s0 = p[mask.s0 & vec_step(val)-1]; \ + result.s1 = p[mask.s1 & vec_step(val)-1]; \ + result.s2 = p[mask.s2 & vec_step(val)-1]; \ + result.s3 = p[mask.s3 & vec_step(val)-1]; \ + return result; \ +} \ +_CLC_OVERLOAD _CLC_DEF res_elemt##4 shuffle2(res_elemt##val_vnum val1, res_elemt##val_vnum val2, mask_elemt##4 mask) \ +{ \ + res_elemt##4 result; \ + res_elemt *p1= (res_elemt*)&val1; \ + res_elemt *p2 = (res_elemt*)&val2; \ + result.s0 = mask.s0 & vec_step(val1) ? p2[mask.s0 & vec_step(val1)-1] : \ + p1[mask.s0 & vec_step(val1)-1]; \ + result.s1 = mask.s1 & vec_step(val1) ? p2[mask.s1 & vec_step(val1)-1] : \ + p1[mask.s1 & vec_step(val1)-1]; \ + result.s2 = mask.s2 & vec_step(val1) ? p2[mask.s2 & vec_step(val1)-1] : \ + p1[mask.s2 & vec_step(val1)-1]; \ + result.s3 = mask.s3 & vec_step(val1) ? 
p2[mask.s3 & vec_step(val1)-1] : \ + p1[mask.s3 & vec_step(val1)-1]; \ + return result; \ +} + +#define TEMPLATE8(res_elemt, val_vnum, mask_elemt) \ +_CLC_OVERLOAD _CLC_DEF res_elemt##8 shuffle(res_elemt##val_vnum val, mask_elemt##8 mask) \ +{ \ + res_elemt##8 result; \ + res_elemt *p = (res_elemt*)&val; \ + result.s0 = p[mask.s0 & vec_step(val)-1]; \ + result.s1 = p[mask.s1 & vec_step(val)-1]; \ + result.s2 = p[mask.s2 & vec_step(val)-1]; \ + result.s3 = p[mask.s3 & vec_step(val)-1]; \ + result.s4 = p[mask.s4 & vec_step(val)-1]; \ + result.s5 = p[mask.s5 & vec_step(val)-1]; \ + result.s6 = p[mask.s6 & vec_step(val)-1]; \ + result.s7 = p[mask.s7 & vec_step(val)-1]; \ + return result; \ +} \ +_CLC_OVERLOAD _CLC_DEF res_elemt##8 shuffle2(res_elemt##val_vnum val1, res_elemt##val_vnum val2, mask_elemt##8 mask) \ +{ \ + res_elemt##8 result; \ + res_elemt *p1= (res_elemt*)&val1; \ + res_elemt *p2 = (res_elemt*)&val2; \ + result.s0 = mask.s0 & vec_step(val1) ? p2[mask.s0 & vec_step(val1)-1] : \ + p1[mask.s0 & vec_step(val1)-1]; \ + result.s1 = mask.s1 & vec_step(val1) ? p2[mask.s1 & vec_step(val1)-1] : \ + p1[mask.s1 & vec_step(val1)-1]; \ + result.s2 = mask.s2 & vec_step(val1) ? p2[mask.s2 & vec_step(val1)-1] : \ + p1[mask.s2 & vec_step(val1)-1]; \ + result.s3 = mask.s3 & vec_step(val1) ? p2[mask.s3 & vec_step(val1)-1] : \ + p1[mask.s3 & vec_step(val1)-1]; \ + result.s4 = mask.s4 & vec_step(val1) ? p2[mask.s4 & vec_step(val1)-1] : \ + p1[mask.s4 & vec_step(val1)-1]; \ + result.s5 = mask.s5 & vec_step(val1) ? p2[mask.s5 & vec_step(val1)-1] : \ + p1[mask.s5 & vec_step(val1)-1]; \ + result.s6 = mask.s6 & vec_step(val1) ? p2[mask.s6 & vec_step(val1)-1] : \ + p1[mask.s6 & vec_step(val1)-1]; \ + result.s7 = mask.s7 & vec_step(val1) ? 
p2[mask.s7 & vec_step(val1)-1] : \ + p1[mask.s7 & vec_step(val1)-1]; \ + return result; \ +} + +#define TEMPLATE16(res_elemt, val_vnum, mask_elemt) \ +_CLC_OVERLOAD _CLC_DEF res_elemt##16 shuffle(res_elemt##val_vnum val, mask_elemt##16 mask) \ +{ \ + res_elemt##16 result; \ + res_elemt *p = (res_elemt*)&val; \ + result.s0 = p[mask.s0 & vec_step(val)-1]; \ + result.s1 = p[mask.s1 & vec_step(val)-1]; \ + result.s2 = p[mask.s2 & vec_step(val)-1]; \ + result.s3 = p[mask.s3 & vec_step(val)-1]; \ + result.s4 = p[mask.s4 & vec_step(val)-1]; \ + result.s5 = p[mask.s5 & vec_step(val)-1]; \ + result.s6 = p[mask.s6 & vec_step(val)-1]; \ + result.s7 = p[mask.s7 & vec_step(val)-1]; \ + result.s8 = p[mask.s8 & vec_step(val)-1]; \ + result.s9 = p[mask.s9 & vec_step(val)-1]; \ + result.sa = p[mask.sa & vec_step(val)-1]; \ + result.sb = p[mask.sb & vec_step(val)-1]; \ + result.sc = p[mask.sc & vec_step(val)-1]; \ + result.sd = p[mask.sd & vec_step(val)-1]; \ + result.se = p[mask.se & vec_step(val)-1]; \ + result.sf = p[mask.sf & vec_step(val)-1]; \ + return result; \ +} \ +_CLC_OVERLOAD _CLC_DEF res_elemt##16 shuffle2(res_elemt##val_vnum val1, res_elemt##val_vnum val2, mask_elemt##16 mask) \ +{ \ + res_elemt##16 result; \ + res_elemt *p1= (res_elemt*)&val1; \ + res_elemt *p2 = (res_elemt*)&val2; \ + result.s0 = mask.s0 & vec_step(val1) ? p2[mask.s0 & vec_step(val1)-1] : \ + p1[mask.s0 & vec_step(val1)-1]; \ + result.s1 = mask.s1 & vec_step(val1) ? p2[mask.s1 & vec_step(val1)-1] : \ + p1[mask.s1 & vec_step(val1)-1]; \ + result.s2 = mask.s2 & vec_step(val1) ? p2[mask.s2 & vec_step(val1)-1] : \ + p1[mask.s2 & vec_step(val1)-1]; \ + result.s3 = mask.s3 & vec_step(val1) ? p2[mask.s3 & vec_step(val1)-1] : \ + p1[mask.s3 & vec_step(val1)-1]; \ + result.s4 = mask.s4 & vec_step(val1) ? p2[mask.s4 & vec_step(val1)-1] : \ + p1[mask.s4 & vec_step(val1)-1]; \ + result.s5 = mask.s5 & vec_step(val1) ? 
p2[mask.s5 & vec_step(val1)-1] : \ + p1[mask.s5 & vec_step(val1)-1]; \ + result.s6 = mask.s6 & vec_step(val1) ? p2[mask.s6 & vec_step(val1)-1] : \ + p1[mask.s6 & vec_step(val1)-1]; \ + result.s7 = mask.s7 & vec_step(val1) ? p2[mask.s7 & vec_step(val1)-1] : \ + p1[mask.s7 & vec_step(val1)-1]; \ + result.s8 = mask.s8 & vec_step(val1) ? p2[mask.s8 & vec_step(val1)-1] : \ + p1[mask.s8 & vec_step(val1)-1]; \ + result.s9 = mask.s9 & vec_step(val1) ? p2[mask.s9 & vec_step(val1)-1] : \ + p1[mask.s9 & vec_step(val1)-1]; \ + result.sa = mask.sa & vec_step(val1) ? p2[mask.sa & vec_step(val1)-1] : \ + p1[mask.sa & vec_step(val1)-1]; \ + result.sb = mask.sb & vec_step(val1) ? p2[mask.sb & vec_step(val1)-1] : \ + p1[mask.sb & vec_step(val1)-1]; \ + result.sc = mask.sc & vec_step(val1) ? p2[mask.sc & vec_step(val1)-1] : \ + p1[mask.sc & vec_step(val1)-1]; \ + result.sd = mask.sd & vec_step(val1) ? p2[mask.sd & vec_step(val1)-1] : \ + p1[mask.sd & vec_step(val1)-1]; \ + result.se = mask.se & vec_step(val1) ? p2[mask.se & vec_step(val1)-1] : \ + p1[mask.se & vec_step(val1)-1]; \ + result.sf = mask.sf & vec_step(val1) ? 
p2[mask.sf & vec_step(val1)-1] : \ + p1[mask.sf & vec_step(val1)-1]; \ + return result; \ +} + +/* CROSS_SIZE instantiates all sixteen combinations of result size {2,4,8,16} and input size {2,4,8,16} for one element type / mask element type pair. */ +#define CROSS_SIZE(type1, type2) \ +TEMPLATE2(type1, 2, type2) \ +TEMPLATE2(type1, 4, type2) \ +TEMPLATE2(type1, 8, type2) \ +TEMPLATE2(type1, 16, type2) \ +TEMPLATE4(type1, 2, type2) \ +TEMPLATE4(type1, 4, type2) \ +TEMPLATE4(type1, 8, type2) \ +TEMPLATE4(type1, 16, type2) \ +TEMPLATE8(type1, 2, type2) \ +TEMPLATE8(type1, 4, type2) \ +TEMPLATE8(type1, 8, type2) \ +TEMPLATE8(type1, 16, type2) \ +TEMPLATE16(type1, 2, type2) \ +TEMPLATE16(type1, 4, type2) \ +TEMPLATE16(type1, 8, type2) \ +TEMPLATE16(type1, 16, type2) \ +/* CROSS_MASKTYPE repeats CROSS_SIZE for each unsigned mask element type, so every element type listed below is paired with uchar, ushort, uint and ulong masks regardless of the element's own width. */ +#define CROSS_MASKTYPE(type) \ +CROSS_SIZE(type, uchar) \ +CROSS_SIZE(type, ushort) \ +CROSS_SIZE(type, uint) \ +CROSS_SIZE(type, ulong) \ + +CROSS_MASKTYPE(char) +CROSS_MASKTYPE(uchar) +CROSS_MASKTYPE(short) +CROSS_MASKTYPE(ushort) +CROSS_MASKTYPE(int) +CROSS_MASKTYPE(uint) +CROSS_MASKTYPE(long) +CROSS_MASKTYPE(ulong) +CROSS_MASKTYPE(float) +CROSS_MASKTYPE(double) diff --git a/src/builtins/sign.cl b/src/builtins/sign.cl new file mode 100644 index 0000000..e440f2f --- /dev/null +++ b/src/builtins/sign.cl @@ -0,0 +1,43 @@ +/****************************************************************************** + * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "clc.h" +/* sign(x) for the 3-, 4-, 8- and 16-element float and double vectors. The nested vector ?: yields 1.0 where x > 0, -1.0 where x < 0, 0.0 where isnan(x), and x itself otherwise (so a zero input is returned as is). EXPAND_SIZES names IMPLEMENTATION before it is defined, which is fine because macros are expanded at the point of use. NOTE(review): no 2-element overload is generated here -- presumably provided elsewhere; confirm. */ +#define EXPAND_SIZES(type) \ + IMPLEMENTATION (_VEC_TYPE(type,3)) \ + IMPLEMENTATION (_VEC_TYPE(type,4)) \ + IMPLEMENTATION (_VEC_TYPE(type,8)) \ + IMPLEMENTATION (_VEC_TYPE(type,16)) \ + +#define IMPLEMENTATION(gentype) \ +_CLC_OVERLOAD _CLC_DEF gentype sign(gentype x) \ +{ return x > (gentype)0.0 ? (gentype) 1.0 : \ + x < (gentype)0.0 ? (gentype)-1.0 : \ + isnan(x) ? (gentype) 0.0 : x; }\ + +EXPAND_SIZES(float) +EXPAND_SIZES(double) diff --git a/src/builtins/sincos.cl b/src/builtins/sincos.cl new file mode 100644 index 0000000..1552f6b --- /dev/null +++ b/src/builtins/sincos.cl @@ -0,0 +1,128 @@ +/****************************************************************************** + * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "clc.h" +/* Scalar workers called by every overload below; this file only declares them. Each takes x and writes the sine and cosine through its two pointer arguments. */ +_CLC_PROTECTED void sincosf(float x, float * sinval, float * cosval); +_CLC_PROTECTED void builtin_sincos(double x, double * sinval, double * cosval); +/* SINCOS_SCALAR_BODY is the function body shared by the scalar sincos overloads: op fills two private temporaries, the cosine is then stored through cosval (declared global, local or private by the overload using the macro) and the sine is returned. */ +#define SINCOS_SCALAR_BODY(type, op) \ +{ \ + type sin_val; \ + type cos_val; \ + op(x, &sin_val, &cos_val); \ + *cosval = cos_val; \ + return sin_val; \ +} \ +/* SINCOS_VECTOR_BODY_<n> expands to one scalar op call per element, writing into the matching slots of sin_val and cos_val addressed as arrays of prim_type. Each size starts by reusing the next smaller expansion. */ +#define SINCOS_VECTOR_BODY_2(prim_type, op) \ + op(x.s0, &(((prim_type*)&sin_val)[0]), &(((prim_type*)&cos_val)[0])); \ + op(x.s1, &(((prim_type*)&sin_val)[1]), &(((prim_type*)&cos_val)[1])); \ + +#define SINCOS_VECTOR_BODY_3(prim_type, op) \ + SINCOS_VECTOR_BODY_2(prim_type, op) \ + op(x.s2, &(((prim_type*)&sin_val)[2]), &(((prim_type*)&cos_val)[2])); \ + +#define SINCOS_VECTOR_BODY_4(prim_type, op) \ + SINCOS_VECTOR_BODY_3(prim_type, op) \ + op(x.s3, &(((prim_type*)&sin_val)[3]), &(((prim_type*)&cos_val)[3])); \ + +#define SINCOS_VECTOR_BODY_8(prim_type, op) \ + SINCOS_VECTOR_BODY_4(prim_type, op) \ + op(x.s4, &(((prim_type*)&sin_val)[4]), &(((prim_type*)&cos_val)[4])); \ + op(x.s5, &(((prim_type*)&sin_val)[5]), &(((prim_type*)&cos_val)[5])); \ + op(x.s6, &(((prim_type*)&sin_val)[6]), &(((prim_type*)&cos_val)[6])); \ + op(x.s7, &(((prim_type*)&sin_val)[7]), &(((prim_type*)&cos_val)[7])); \ + +#define SINCOS_VECTOR_BODY_16(prim_type, op) \ + SINCOS_VECTOR_BODY_8(prim_type, op) \ + op(x.s8, &(((prim_type*)&sin_val)[8]), &(((prim_type*)&cos_val)[8])); \ + op(x.s9, &(((prim_type*)&sin_val)[9]), &(((prim_type*)&cos_val)[9])); \ + op(x.sa, &(((prim_type*)&sin_val)[10]), &(((prim_type*)&cos_val)[10])); \ + op(x.sb, &(((prim_type*)&sin_val)[11]), &(((prim_type*)&cos_val)[11])); \ + op(x.sc, &(((prim_type*)&sin_val)[12]), &(((prim_type*)&cos_val)[12])); \ + op(x.sd, &(((prim_type*)&sin_val)[13]), &(((prim_type*)&cos_val)[13])); \ + op(x.se, &(((prim_type*)&sin_val)[14]), &(((prim_type*)&cos_val)[14])); \ + op(x.sf, &(((prim_type*)&sin_val)[15]), 
&(((prim_type*)&cos_val)[15])); \ + +#define SINCOS_VECTOR_BODY(prim_type, num, op) \ +{ \ + prim_type##num sin_val; \ + prim_type##num cos_val; \ + SINCOS_VECTOR_BODY_##num(prim_type, op)\ + *cosval = cos_val; \ + return sin_val; \ +} \ +/* NOTE(review): the three scalar float overloads use _CLC_INLINE while every other overload in this file uses _CLC_DEF -- confirm that the difference is intended. */ +_CLC_OVERLOAD _CLC_INLINE float sincos(float x, global float * cosval) SINCOS_SCALAR_BODY(float, sincosf) +_CLC_OVERLOAD _CLC_INLINE float sincos(float x, local float * cosval) SINCOS_SCALAR_BODY(float, sincosf) +_CLC_OVERLOAD _CLC_INLINE float sincos(float x, private float * cosval) SINCOS_SCALAR_BODY(float, sincosf) + +_CLC_OVERLOAD _CLC_DEF float2 sincos(float2 x, global float2 * cosval) SINCOS_VECTOR_BODY(float, 2, sincosf) +_CLC_OVERLOAD _CLC_DEF float2 sincos(float2 x, local float2 * cosval) SINCOS_VECTOR_BODY(float, 2, sincosf) +_CLC_OVERLOAD _CLC_DEF float2 sincos(float2 x, private float2 * cosval) SINCOS_VECTOR_BODY(float, 2, sincosf) + +_CLC_OVERLOAD _CLC_DEF float3 sincos(float3 x, global float3 * cosval) SINCOS_VECTOR_BODY(float, 3, sincosf) +_CLC_OVERLOAD _CLC_DEF float3 sincos(float3 x, local float3 * cosval) SINCOS_VECTOR_BODY(float, 3, sincosf) +_CLC_OVERLOAD _CLC_DEF float3 sincos(float3 x, private float3 * cosval) SINCOS_VECTOR_BODY(float, 3, sincosf) + +_CLC_OVERLOAD _CLC_DEF float4 sincos(float4 x, global float4 * cosval) SINCOS_VECTOR_BODY(float, 4, sincosf) +_CLC_OVERLOAD _CLC_DEF float4 sincos(float4 x, local float4 * cosval) SINCOS_VECTOR_BODY(float, 4, sincosf) +_CLC_OVERLOAD _CLC_DEF float4 sincos(float4 x, private float4 * cosval) SINCOS_VECTOR_BODY(float, 4, sincosf) + +_CLC_OVERLOAD _CLC_DEF float8 sincos(float8 x, global float8 * cosval) SINCOS_VECTOR_BODY(float, 8, sincosf) +_CLC_OVERLOAD _CLC_DEF float8 sincos(float8 x, local float8 * cosval) SINCOS_VECTOR_BODY(float, 8, sincosf) +_CLC_OVERLOAD _CLC_DEF float8 sincos(float8 x, private float8 * cosval) SINCOS_VECTOR_BODY(float, 8, sincosf) + +_CLC_OVERLOAD _CLC_DEF float16 sincos(float16 x, global float16 * cosval) SINCOS_VECTOR_BODY(float, 16, 
sincosf) +_CLC_OVERLOAD _CLC_DEF float16 sincos(float16 x, local float16 * cosval) SINCOS_VECTOR_BODY(float, 16, sincosf) +_CLC_OVERLOAD _CLC_DEF float16 sincos(float16 x, private float16 * cosval) SINCOS_VECTOR_BODY(float, 16, sincosf) + +_CLC_OVERLOAD _CLC_DEF double sincos(double x, global double * cosval) SINCOS_SCALAR_BODY(double, builtin_sincos) +_CLC_OVERLOAD _CLC_DEF double sincos(double x, local double * cosval) SINCOS_SCALAR_BODY(double, builtin_sincos) +_CLC_OVERLOAD _CLC_DEF double sincos(double x, private double * cosval) SINCOS_SCALAR_BODY(double, builtin_sincos) + +_CLC_OVERLOAD _CLC_DEF double2 sincos(double2 x, global double2 * cosval) SINCOS_VECTOR_BODY(double, 2, builtin_sincos) +_CLC_OVERLOAD _CLC_DEF double2 sincos(double2 x, local double2 * cosval) SINCOS_VECTOR_BODY(double, 2, builtin_sincos) +_CLC_OVERLOAD _CLC_DEF double2 sincos(double2 x, private double2 * cosval) SINCOS_VECTOR_BODY(double, 2, builtin_sincos) + +_CLC_OVERLOAD _CLC_DEF double3 sincos(double3 x, global double3 * cosval) SINCOS_VECTOR_BODY(double, 3, builtin_sincos) +_CLC_OVERLOAD _CLC_DEF double3 sincos(double3 x, local double3 * cosval) SINCOS_VECTOR_BODY(double, 3, builtin_sincos) +_CLC_OVERLOAD _CLC_DEF double3 sincos(double3 x, private double3 * cosval) SINCOS_VECTOR_BODY(double, 3, builtin_sincos) + +_CLC_OVERLOAD _CLC_DEF double4 sincos(double4 x, global double4 * cosval) SINCOS_VECTOR_BODY(double, 4, builtin_sincos) +_CLC_OVERLOAD _CLC_DEF double4 sincos(double4 x, local double4 * cosval) SINCOS_VECTOR_BODY(double, 4, builtin_sincos) +_CLC_OVERLOAD _CLC_DEF double4 sincos(double4 x, private double4 * cosval) SINCOS_VECTOR_BODY(double, 4, builtin_sincos) + +_CLC_OVERLOAD _CLC_DEF double8 sincos(double8 x, global double8 * cosval) SINCOS_VECTOR_BODY(double, 8, builtin_sincos) +_CLC_OVERLOAD _CLC_DEF double8 sincos(double8 x, local double8 * cosval) SINCOS_VECTOR_BODY(double, 8, builtin_sincos) +_CLC_OVERLOAD _CLC_DEF double8 sincos(double8 x, private double8 * cosval) 
SINCOS_VECTOR_BODY(double, 8, builtin_sincos) + +_CLC_OVERLOAD _CLC_DEF double16 sincos(double16 x, global double16 * cosval) SINCOS_VECTOR_BODY(double, 16, builtin_sincos) +_CLC_OVERLOAD _CLC_DEF double16 sincos(double16 x, local double16 * cosval) SINCOS_VECTOR_BODY(double, 16, builtin_sincos) +_CLC_OVERLOAD _CLC_DEF double16 sincos(double16 x, private double16 * cosval) SINCOS_VECTOR_BODY(double, 16, builtin_sincos) + diff --git a/src/builtins/smoothstep.cl b/src/builtins/smoothstep.cl new file mode 100644 index 0000000..96e3d2a --- /dev/null +++ b/src/builtins/smoothstep.cl @@ -0,0 +1,77 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "clc.h" + +_CLC_OVERLOAD _CLC_DEF float smoothstep(float edge0, float edge1, float x) +{ + float t = clamp((float)((x-edge0)/(edge1-edge0)), 0.0f, 1.0f); + return t * t * (3.0f - 2.0f*t); +} + +_CLC_OVERLOAD _CLC_DEF double smoothstep(double edge0, double edge1, double x) +{ + double t = clamp((double)((x-edge0)/(edge1-edge0)), 0.0, 1.0); + return t * t * (3.0 - 2.0*t); +} + +#define FLOAT_TEMPLATE(N) \ +_CLC_OVERLOAD _CLC_DEF float##N smoothstep(float##N edge0, float##N edge1, float##N x) \ +{\ + float##N t = clamp((x-edge0)/(edge1-edge0), 0.0f, 1.0f); \ + return t*t*(3.0f - 2.0f * t); \ +}\ +_CLC_OVERLOAD _CLC_DEF float##N smoothstep(float edge0, float edge1, float##N x) \ +{\ + float##N t = clamp((x-edge0)/(edge1-edge0), 0.0f, 1.0f); \ + return t*t*(3.0f - 2.0f * t);\ +}\ + + +#define DOUBLE_TEMPLATE(N) \ +_CLC_OVERLOAD _CLC_DEF double##N smoothstep(double##N edge0, double##N edge1, double##N x) \ +{\ + double##N t = clamp((x-edge0)/(edge1-edge0), 0.0, 1.0); \ + return t*t*(3.0 - 2.0 * t);\ +}\ +_CLC_OVERLOAD _CLC_DEF double##N smoothstep(double edge0, double edge1, double##N x) \ +{\ + double##N t = clamp((x-edge0)/(edge1-edge0), 0.0, 1.0); \ + return t*t*(3.0 - 2.0 * t);\ +} + +FLOAT_TEMPLATE(2) +FLOAT_TEMPLATE(3) +FLOAT_TEMPLATE(4) +FLOAT_TEMPLATE(8) +FLOAT_TEMPLATE(16) + +DOUBLE_TEMPLATE(2) +DOUBLE_TEMPLATE(3) +DOUBLE_TEMPLATE(4) 
+DOUBLE_TEMPLATE(8) +DOUBLE_TEMPLATE(16) diff --git a/src/builtins/step.cl b/src/builtins/step.cl new file mode 100644 index 0000000..daecefd --- /dev/null +++ b/src/builtins/step.cl @@ -0,0 +1,43 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "clc.h" + +#define EXPAND_SIZES(type) \ + IMPLEMENTATION(_VEC_TYPE(type,3), type) \ + IMPLEMENTATION(_VEC_TYPE(type,4), type) \ + IMPLEMENTATION(_VEC_TYPE(type,8), type) \ + IMPLEMENTATION(_VEC_TYPE(type,16), type) \ + +#define IMPLEMENTATION(gentype, sgentype) \ +_CLC_OVERLOAD _CLC_DEF gentype step(gentype edge, gentype x) \ + { return x < edge ? (gentype)0.0 : (gentype)1.0 ; } \ +_CLC_OVERLOAD _CLC_DEF gentype step(sgentype edge, gentype x) \ + { return x < (gentype)edge ? (gentype)0.0 : (gentype)1.0 ; } \ + +EXPAND_SIZES(float) +EXPAND_SIZES(double) diff --git a/src/builtins/sub_sat.cl b/src/builtins/sub_sat.cl new file mode 100644 index 0000000..78442f0 --- /dev/null +++ b/src/builtins/sub_sat.cl @@ -0,0 +1,37 @@ +/****************************************************************************** + * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "cpu.h" + +BINARY_VEC_DEF(char, char, sub_sat, sub_sat) +BINARY_VEC_DEF(uchar, uchar, sub_sat, sub_sat) +BINARY_VEC_DEF(short, short, sub_sat, sub_sat) +BINARY_VEC_DEF(ushort, ushort,sub_sat, sub_sat) +BINARY_VEC_DEF(int, int, sub_sat, sub_sat) +BINARY_VEC_DEF(uint, uint, sub_sat, sub_sat) +BINARY_VEC_DEF(long, long, sub_sat, sub_sat) +BINARY_VEC_DEF(ulong, ulong, sub_sat, sub_sat) diff --git a/src/builtins/upsample.cl b/src/builtins/upsample.cl new file mode 100644 index 0000000..8415a33 --- /dev/null +++ b/src/builtins/upsample.cl @@ -0,0 +1,56 @@ +/****************************************************************************** + * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "cpu.h" + +/*----------------------------------------------------------------------------- +* Expand vector type implementations +*----------------------------------------------------------------------------*/ +#define TEMPLATE(xtype,ytype,restype) \ +_CLC_OVERLOAD _CLC_DEF restype upsample(xtype x, ytype y) \ +{ return (restype)(upsample(x.lo,y.lo), upsample(x.hi,y.hi)); } + +#define TEMPLATE3(xtype,ytype,restype) \ +_CLC_OVERLOAD _CLC_DEF restype upsample(xtype x, ytype y) \ +{ return (restype)(upsample(x.s0,y.s0), upsample(x.s1,y.s1), upsample(x.s2,y.s2)); } + +#define EXPAND_SIZES(xtype, ytype, restype)\ + TEMPLATE(_VEC_TYPE(xtype,2), _VEC_TYPE(ytype,2), _VEC_TYPE(restype,2))\ + TEMPLATE3(_VEC_TYPE(xtype,3), _VEC_TYPE(ytype,3), _VEC_TYPE(restype,3))\ + TEMPLATE(_VEC_TYPE(xtype,4), _VEC_TYPE(ytype,4), _VEC_TYPE(restype,4))\ + TEMPLATE(_VEC_TYPE(xtype,8), _VEC_TYPE(ytype,8), _VEC_TYPE(restype,8))\ + TEMPLATE(_VEC_TYPE(xtype,16), _VEC_TYPE(ytype,16), _VEC_TYPE(restype,16))\ + +#define _EXPAND_UPSAMPLE_TYPES() \ + EXPAND_SIZES(char, uchar, short) \ + EXPAND_SIZES(uchar, uchar, ushort) \ + EXPAND_SIZES(short, ushort, int) \ + EXPAND_SIZES(ushort, ushort, uint) \ + EXPAND_SIZES(int, uint, long) \ + EXPAND_SIZES(uint, uint, ulong) \ + +_EXPAND_UPSAMPLE_TYPES() diff --git a/src/builtins/vload.cl b/src/builtins/vload.cl new file mode 100644 index 0000000..2cd9a3a --- /dev/null +++ b/src/builtins/vload.cl @@ -0,0 +1,127 @@ +/****************************************************************************** + * Copyright (c) 2011-2014, Peter Collingbourne <peter@pcc.me.uk> + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "clc.h" + +#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \ + _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ + return (PRIM_TYPE##3)(x[3*offset] , x[3*offset+1], x[3*offset+2]); \ + } \ + _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ + return (PRIM_TYPE##4)(x[(offset<<2)], x[1+(offset<<2)], x[2+(offset<<2)], x[3+(offset<<2)]); \ + } \ + _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ + return (PRIM_TYPE##8)(x[(offset<<3)], x[1+(offset<<3)], x[2+(offset<<3)], x[3+(offset<<3)],\ + x[4+(offset<<3)], x[5+(offset<<3)], x[6+(offset<<3)], x[7+(offset<<3)]); \ + } \ + _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ + return (PRIM_TYPE##16)(x[(offset<<4)], x[1+(offset<<4)], x[2+(offset<<4)], x[3+(offset<<4)],\ + x[4+(offset<<4)], x[5+(offset<<4)], x[6+(offset<<4)], x[7+(offset<<4)], \ + x[8+(offset<<4)], x[9+(offset<<4)], x[10+(offset<<4)], x[11+(offset<<4)], \ + x[12+(offset<<4)], x[13+(offset<<4)], x[14+(offset<<4)], x[15+(offset<<4)]); \ + } \ + +#define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE) \ + VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __private) \ + VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __local) \ + VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant) \ + VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global) \ + +#define VLOAD_TYPES() \ + VLOAD_ADDR_SPACES(char) \ + VLOAD_ADDR_SPACES(uchar) \ + VLOAD_ADDR_SPACES(short) \ + VLOAD_ADDR_SPACES(ushort) \ + VLOAD_ADDR_SPACES(int) \ + VLOAD_ADDR_SPACES(uint) \ + VLOAD_ADDR_SPACES(long) \ + VLOAD_ADDR_SPACES(ulong) \ + VLOAD_ADDR_SPACES(float) \ + VLOAD_ADDR_SPACES(double)\ + +VLOAD_TYPES() + +#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \ + _CLC_OVERLOAD _CLC_INLINE void vstore3(PRIM_TYPE##3 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ + mem[3*offset] = 
vec.s0; \ + mem[(3*offset)+1] = vec.s1; \ + mem[(3*offset)+2] = vec.s2; \ + } \ + _CLC_OVERLOAD _CLC_INLINE void vstore4(PRIM_TYPE##4 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ + mem[offset<<2] = vec.s0; \ + mem[1+(offset<<2)] = vec.s1; \ + mem[2+(offset<<2)] = vec.s2; \ + mem[3+(offset<<2)] = vec.s3; \ + } \ + _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ + mem[(offset<<3)] = vec.s0; \ + mem[1+(offset<<3)] = vec.s1; \ + mem[2+(offset<<3)] = vec.s2; \ + mem[3+(offset<<3)] = vec.s3; \ + mem[4+(offset<<3)] = vec.s4; \ + mem[5+(offset<<3)] = vec.s5; \ + mem[6+(offset<<3)] = vec.s6; \ + mem[7+(offset<<3)] = vec.s7; \ + } \ + _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ + mem[(offset<<4)] = vec.s0; \ + mem[1+(offset<<4)] = vec.s1; \ + mem[2+(offset<<4)] = vec.s2; \ + mem[3+(offset<<4)] = vec.s3; \ + mem[4+(offset<<4)] = vec.s4; \ + mem[5+(offset<<4)] = vec.s5; \ + mem[6+(offset<<4)] = vec.s6; \ + mem[7+(offset<<4)] = vec.s7; \ + mem[8+(offset<<4)] = vec.s8; \ + mem[9+(offset<<4)] = vec.s9; \ + mem[10+(offset<<4)] = vec.sa; \ + mem[11+(offset<<4)] = vec.sb; \ + mem[12+(offset<<4)] = vec.sc; \ + mem[13+(offset<<4)] = vec.sd; \ + mem[14+(offset<<4)] = vec.se; \ + mem[15+(offset<<4)] = vec.sf; \ + } \ + +#define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE) \ + VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private) \ + VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local) \ + VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global) \ + +#define VSTORE_TYPES() \ + VSTORE_ADDR_SPACES(char) \ + VSTORE_ADDR_SPACES(uchar) \ + VSTORE_ADDR_SPACES(short) \ + VSTORE_ADDR_SPACES(ushort) \ + VSTORE_ADDR_SPACES(int) \ + VSTORE_ADDR_SPACES(uint) \ + VSTORE_ADDR_SPACES(long) \ + VSTORE_ADDR_SPACES(ulong) \ + VSTORE_ADDR_SPACES(float) \ + VSTORE_ADDR_SPACES(double) \ + +VSTORE_TYPES() diff --git a/src/core/commandqueue.cpp b/src/core/commandqueue.cpp new file mode 100644 
index 0000000..662dad1 --- /dev/null +++ b/src/core/commandqueue.cpp @@ -0,0 +1,1018 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/ + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** + * \file commandqueue.cpp + * \brief Command queue + */ + +#include "commandqueue.h" +#include "context.h" +#include "deviceinterface.h" +#include "propertylist.h" +#include "events.h" + +#include <cstring> +#include <cstdlib> +#include <ctime> +#include <iostream> +#include <stdio.h> + +using namespace Coal; + +#define OOO_QUEUE_PUSH_EVENTS_THRESHOLD 64 + +/****************************************************************************** +* CommandQueue::CommandQueue +******************************************************************************/ +CommandQueue::CommandQueue(Context *ctx, + DeviceInterface *device, + cl_command_queue_properties properties, + cl_int *errcode_ret) +: Object(Object::T_CommandQueue, ctx), p_device(device), + p_num_events_in_queue(0), p_num_events_on_device(0), + p_num_events_completed(0), + p_properties(properties), p_flushed(true) +{ + // Initialize the locking machinery + pthread_mutex_init(&p_event_list_mutex, 0); + pthread_cond_init(&p_event_list_cond, 0); + + // Check that the device belongs to the context + if (!ctx->hasDevice(device)) + { + *errcode_ret = CL_INVALID_DEVICE; + return; + } + p_device->init(); + + *errcode_ret = checkProperties(); +} + +/****************************************************************************** +* CommandQueue::~CommandQueue() +******************************************************************************/ +CommandQueue::~CommandQueue() +{ + cleanReleasedEvents(); + // Free the mutex + pthread_mutex_destroy(&p_event_list_mutex); + pthread_cond_destroy(&p_event_list_cond); +} + +/****************************************************************************** +* cl_int CommandQueue::info +******************************************************************************/ +cl_int CommandQueue::info(cl_command_queue_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) const +{ + void *value = 0; + size_t value_length = 0; + + union { + cl_uint 
cl_uint_var; + cl_device_id cl_device_id_var; + cl_context cl_context_var; + cl_command_queue_properties cl_command_queue_properties_var; + }; + + switch (param_name) + { + case CL_QUEUE_CONTEXT: + SIMPLE_ASSIGN(cl_context, parent()); + break; + + case CL_QUEUE_DEVICE: + SIMPLE_ASSIGN(cl_device_id, p_device); + break; + + case CL_QUEUE_REFERENCE_COUNT: + SIMPLE_ASSIGN(cl_uint, references()); + break; + + case CL_QUEUE_PROPERTIES: + SIMPLE_ASSIGN(cl_command_queue_properties, p_properties); + break; + + default: + return CL_INVALID_VALUE; + } + + if (param_value && param_value_size < value_length) + return CL_INVALID_VALUE; + + if (param_value_size_ret) + *param_value_size_ret = value_length; + + if (param_value) + std::memcpy(param_value, value, value_length); + + return CL_SUCCESS; +} + +/****************************************************************************** +* cl_int CommandQueue::setProperty +******************************************************************************/ +cl_int CommandQueue::setProperty(cl_command_queue_properties properties, + cl_bool enable, + cl_command_queue_properties *old_properties) +{ + if (old_properties) + *old_properties = p_properties; + + if (enable) + p_properties |= properties; + else + p_properties &= ~properties; + + return checkProperties(); +} + +/****************************************************************************** +* cl_int CommandQueue::checkProperties +******************************************************************************/ +cl_int CommandQueue::checkProperties() const +{ + // Check that all the properties are valid + cl_command_queue_properties properties = + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | + CL_QUEUE_PROFILING_ENABLE; + + if ((p_properties & properties) != p_properties) + return CL_INVALID_VALUE; + + // Check that the device handles these properties + cl_int result; + + result = p_device->info(CL_DEVICE_QUEUE_PROPERTIES, + sizeof(cl_command_queue_properties), + &properties, + 0); + + if 
(result != CL_SUCCESS) + return result; + + if ((p_properties & properties) != p_properties) + return CL_INVALID_QUEUE_PROPERTIES; + + return CL_SUCCESS; +} + +/****************************************************************************** +* void CommandQueue::flush() +******************************************************************************/ +void CommandQueue::flush() +{ + // Wait for the command queue to be in state "flushed". + pthread_mutex_lock(&p_event_list_mutex); + + while (!p_flushed) + pthread_cond_wait(&p_event_list_cond, &p_event_list_mutex); + + pthread_mutex_unlock(&p_event_list_mutex); + + cleanReleasedEvents(); +} + +/****************************************************************************** +* void CommandQueue::finish() +******************************************************************************/ +void CommandQueue::finish() +{ + // As pushEventsOnDevice doesn't remove SUCCESS events, we may need + // to do that here in order not to be stuck. + cleanEvents(); + + // All the queued events must have completed. When they are, they get + // deleted from the command queue, so simply wait for it to become empty. 
+ pthread_mutex_lock(&p_event_list_mutex); + + while (p_num_events_in_queue != 0) + pthread_cond_wait(&p_event_list_cond, &p_event_list_mutex); + + pthread_mutex_unlock(&p_event_list_mutex); + + cleanReleasedEvents(); +} + +/****************************************************************************** +* cl_int CommandQueue::queueEvent(Event *event) +******************************************************************************/ +cl_int CommandQueue::queueEvent(Event *event) +{ + // Let the device initialize the event (for instance, a pointer at which + // memory would be mapped) + cl_int rs = p_device->initEventDeviceData(event); + + if (rs != CL_SUCCESS) + return rs; + + // Append the event at the end of the list + pthread_mutex_lock(&p_event_list_mutex); + + p_events.push_back(event); + p_num_events_in_queue += 1; + p_flushed = false; + + pthread_mutex_unlock(&p_event_list_mutex); + + // Timing info if needed + if (p_properties & CL_QUEUE_PROFILING_ENABLE) + event->updateTiming(Event::Queue); + + // Explore the list for events we can push on the device + pushEventsOnDevice(); + + cleanReleasedEvents(); + + return CL_SUCCESS; +} + +/****************************************************************************** +* void CommandQueue::releaseEvent() +******************************************************************************/ +void CommandQueue::releaseEvent(Event *e) +{ + pthread_mutex_lock(&p_event_list_mutex); + p_released_events.push_back(e); + pthread_mutex_unlock(&p_event_list_mutex); +} + +/****************************************************************************** +* void CommandQueue::cleanEvents() +******************************************************************************/ +void CommandQueue::cleanEvents() +{ + bool is_inorder = + (p_properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) == 0; + + pthread_mutex_lock(&p_event_list_mutex); + + // No need to cleanEvents() every time an event finishes, so that we can + // save on the event traversal 
time. 16 is a number that can be tuned + // (e.g. using ooo example). + if (p_num_events_completed < 16 && p_num_events_on_device > 0 && + p_num_events_in_queue - p_num_events_completed > 0) + { + pthread_mutex_unlock(&p_event_list_mutex); + return; + } + + std::list<Event *>::iterator it = p_events.begin(), oldit; + + while (it != p_events.end()) + { + Event *event = *it; + + if (event->status() == Event::Complete) + { + // We cannot be deleted from inside us + event->setReleaseParent(false); + oldit = it; + ++it; + + p_num_events_in_queue -= 1; + p_num_events_completed -= 1; + p_events.erase(oldit); + // put Completed events into another list + // let main thread release/delete them + p_released_events.push_back(event); + } + else if (is_inorder) + { + // In Order Queue events are dispatched and completed in Order + break; + } + else + { + ++it; + } + } + + // We have cleared the list, so wake up the sleeping threads + if (p_num_events_in_queue == 0) + pthread_cond_broadcast(&p_event_list_cond); + + pthread_mutex_unlock(&p_event_list_mutex); + + // Check now if we have to be deleted + if (references() == 0) + { + delete this; + } +} + +/****************************************************************************** +* void CommandQueue::cleanReleasedEvents() +* !!! Can only be called by the main thread!!! new/delete, malloc/free are not +* thread safe on ARM, so let main thread handle them SOLELY! +******************************************************************************/ +void CommandQueue::cleanReleasedEvents() +{ + pthread_mutex_lock(&p_event_list_mutex); + + while (! 
p_released_events.empty()) + { + Event *event = p_released_events.front(); + clReleaseEvent((cl_event)event); + p_released_events.pop_front(); + } + + pthread_mutex_unlock(&p_event_list_mutex); +} + +/****************************************************************************** +* void CommandQueue::pushEventsOnDevice() +* Who is calling this function: +* (ready_event, one_event_completed_on_device) +* (not NULL, * ): worker thread, push till this one ready event +* ( NULL, true ): worker thread, one completes, push rest on this queue +* ( NULL, false): main thread, queued a new event, push this queue +******************************************************************************/ +void CommandQueue::pushEventsOnDevice(Event *ready_event, + bool one_event_completed_on_device) +{ + int non_complete_events_traversed = 0; + bool is_ooo = (p_properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0; + bool do_profile = (p_properties & CL_QUEUE_PROFILING_ENABLE) != 0; + + pthread_mutex_lock(&p_event_list_mutex); + + if (one_event_completed_on_device) + { + p_num_events_on_device -= 1; + p_num_events_completed += 1; + } + + // No need to push more events on Device if 1) device has already got + // enough to work on, and 2) not pushing won't cause starvation of this + // commandqueue. Not pushing can save p_event_list traversal time. 
+ // 2 is a QoS number, set to 2 for the time being + // imagaine there are multiple commandqueues on same device + if(is_ooo && ready_event == NULL && + p_num_events_on_device > 2 && p_device->gotEnoughToWorkOn()) + { + pthread_mutex_unlock(&p_event_list_mutex); + return; + } + + // Explore the events in p_events and push on the device all of them that + // are : + // + // - Not already pushed (in Event::Queued state) + // - Not after a barrier, except if we begin with a barrier + // - If we are in-order, only the first event in Event::Queued state can + // be pushed + + std::list<Event *>::iterator it = p_events.begin(); + std::list<Event *>::iterator oldit; + bool first = true; + + // We assume that we will flush the command queue (submit all the events) + // This will be changed in the while() when we know that not all events + // are submitted. + p_flushed = true; + + while (it != p_events.end()) + { + Event *event = *it; + + // If the event is completed, remove it + if (event->status() == Event::Complete) + { + event->setReleaseParent(false); + oldit = it; + ++it; + + p_num_events_completed -= 1; + p_num_events_in_queue -= 1; + p_events.erase(oldit); + // put Completed events into another list + // let main thread release/delete them + p_released_events.push_back(event); + continue; + } + + // If OOO queue threshold is met, skip examining the rest of events + if(ready_event == NULL && + non_complete_events_traversed > OOO_QUEUE_PUSH_EVENTS_THRESHOLD) + break; + non_complete_events_traversed += 1; + + // We cannot do out-of-order, so we can only push the first event. + if (!is_ooo && !first) + { + p_flushed = false; // There are remaining events. + break; + } + + // Stop if we encounter a barrier that isn't the first event in the list. 
+ if (event->type() == Event::Barrier && !first) + { + // We have events to wait, stop + p_flushed = false; + break; + } + + // Completed events and first barriers are out, it remains real events + // that have to block in-order execution. + first = false; + + // If the event is not "pushable" (in Event::Queued state), skip it + // It is either Submitted or Running. + if (event->status() != Event::Queued) + { + // Intended event is scheduled, skip the rest in queue + if (event == ready_event) break; + + ++it; + continue; + } + + // Check that all the waiting-on events of this event are finished + if (! event->waitEventsAllCompleted()) + { + p_flushed = false; + // If we encounter a WaitForEvents event that is not "finished", + // don't push events after it. + if (event->type() == Event::WaitForEvents) + break; + + // The event has its dependencies not already met. + ++it; + continue; + } + + if (event->isInstantaneous()) + { + // Set the event as completed. This will call pushEventsOnDevice, + // again, so release the lock to avoid a deadlock. We also return + // because the recursive call will continue our work. 
+ pthread_mutex_unlock(&p_event_list_mutex); + event->setStatus(Event::Complete); + return; + } + + // The event can be pushed, if we need to + if (do_profile) event->updateTiming(Event::Submit); + + event->setStatus(Event::Submitted); + p_num_events_on_device += 1; + p_device->pushEvent(event); + } + + if (ready_event != NULL && p_flushed) + p_flushed = (p_num_events_in_queue == 0); + + if (p_flushed) + pthread_cond_broadcast(&p_event_list_cond); + + pthread_mutex_unlock(&p_event_list_mutex); +} + +/****************************************************************************** +* Event **CommandQueue::events(unsigned int &count) +******************************************************************************/ +Event **CommandQueue::events(unsigned int &count, + bool include_completed_events) +{ + Event **result = NULL; + + pthread_mutex_lock(&p_event_list_mutex); + + count = p_num_events_in_queue; + if (count > 0) + result = (Event **)std::malloc(count * sizeof(Event *)); + + // Copy each event of the list into result, retaining them + unsigned int index = 0; + std::list<Event *>::iterator it = p_events.begin(); + + while (it != p_events.end()) + { + if (! include_completed_events) + { + Event *e = *it; + if (e->status() == Event::Complete) + { + ++it; + continue; + } + } + + result[index] = *it; + result[index]->reference(); + + ++it; + ++index; + } + count = index; + + // Now result contains an immutable list of events. Even if the events + // become completed in another thread while result is used, the events + // are retained and so guaranteed to remain valid. 
+ pthread_mutex_unlock(&p_event_list_mutex); + + return result; +} + +/****************************************************************************** +* Event::Event +******************************************************************************/ +Event::Event(CommandQueue *parent, + Status status, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret) +: Object(Object::T_Event, parent), + p_status(status), p_device_data(0) +{ + // Initialize the locking machinery + pthread_cond_init(&p_state_change_cond, 0); + pthread_mutex_init(&p_state_mutex, 0); + + std::memset(&p_timing, 0, sizeof(p_timing)); + + // Check sanity of parameters + if (!event_wait_list && num_events_in_wait_list) + { + *errcode_ret = CL_INVALID_EVENT_WAIT_LIST; + return; + } + + if (event_wait_list && !num_events_in_wait_list) + { + *errcode_ret = CL_INVALID_EVENT_WAIT_LIST; + return; + } + + // Check that none of the events in event_wait_list is in an error state + for (cl_uint i=0; i<num_events_in_wait_list; ++i) + { + if (event_wait_list[i] == 0) + { + *errcode_ret = CL_INVALID_EVENT_WAIT_LIST; + return; + } + else if (event_wait_list[i]->status() < 0) + { + *errcode_ret = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST; + return; + } + } + + if (parent && num_events_in_wait_list > 0) + { + pthread_mutex_lock(&p_state_mutex); + for (cl_uint i=0; i<num_events_in_wait_list; ++i) + { + // if event_wait_list[i] is already COMPLETE, don't add it!!! 
+ if (event_wait_list[i]->addDependentEvent(this)) + p_wait_events.push_back(event_wait_list[i]); + } + pthread_mutex_unlock(&p_state_mutex); + } +} + +/****************************************************************************** +* void Event::freeDeviceData() +******************************************************************************/ +void Event::freeDeviceData() +{ + if (parent() && p_device_data) + { + DeviceInterface *device = 0; + ((CommandQueue *)parent())->info(CL_QUEUE_DEVICE, sizeof(DeviceInterface *), &device, 0); + + device->freeEventDeviceData(this); + } +} + +/****************************************************************************** +* Event::~Event() +******************************************************************************/ +Event::~Event() +{ + pthread_mutex_destroy(&p_state_mutex); + pthread_cond_destroy(&p_state_change_cond); +} + +/****************************************************************************** +* bool Event::isInstantaneous() +******************************************************************************/ +bool Event::isInstantaneous() const +{ + // A dummy event has nothing to do on an execution device and must be + // completed directly after being "submitted". + + switch (type()) + { + case Marker: + case User: + case Barrier: + case WaitForEvents: + return true; + + default: + return false; + } +} + +/****************************************************************************** +* void Event::setStatus +******************************************************************************/ +int Event::setStatusHelper(Status status) +{ + int num_dependent_events; + + // TODO: If status < 0, terminate all the events depending on us. 
+ pthread_mutex_lock(&p_state_mutex); + p_status = status; + num_dependent_events = p_dependent_events.size(); + + pthread_cond_broadcast(&p_state_change_cond); + + // Call the callbacks + std::multimap<Status, CallbackData>::const_iterator it; + std::pair<std::multimap<Status, CallbackData>::const_iterator, + std::multimap<Status, CallbackData>::const_iterator> ret; + + ret = p_callbacks.equal_range(status > 0 ? status : Complete); + + for (it=ret.first; it!=ret.second; ++it) + { + const CallbackData &data = (*it).second; + data.callback((cl_event)this, p_status, data.user_data); + } + + pthread_mutex_unlock(&p_state_mutex); + + return num_dependent_events; +} + +void Event::setStatus(Status status) +{ + if (type() == Event::User || (parent() && status == Complete)) + { + CommandQueue *cq = (CommandQueue *) parent(); + + int num_dependent_events = setStatusHelper(status); + /*--------------------------------------------------------------------- + * From this point on, the event could be dereferenced to 0 and deleted! + * Thus we cannot call flushQueues(). Need to save these queues. + *--------------------------------------------------------------------*/ + + /*--------------------------------------------------------------------- + * Notify dependent events, remove dependence, and push them if possible + *--------------------------------------------------------------------*/ + for (int i = 0; i < num_dependent_events; i += 1) + { + Event *d_event = p_dependent_events[i]; + CommandQueue *q = (CommandQueue *) d_event->parent(); + if (d_event->removeWaitEvent(this) && q != NULL) // order! + { + q->pushEventsOnDevice(d_event, (cq == q)); + if (cq == q) cq = NULL; + } + } + + /*--------------------------------------------------------------------- + * Inform our parent to push other events to the device if haven't done + * so already. UserEvent's parent is NULL. 
+ *--------------------------------------------------------------------*/ + if (cq != NULL) cq->pushEventsOnDevice(NULL, true); + } + else + setStatusHelper(status); +} + +bool Event::addDependentEvent(Event *event) +{ + pthread_mutex_lock(&p_state_mutex); + if (p_status == Event::Complete) + { + pthread_mutex_unlock(&p_state_mutex); + return false; + } + + p_dependent_events.push_back(event); + Object::reference(); // retain this event + pthread_mutex_unlock(&p_state_mutex); + return true; +} + +bool Event::removeWaitEvent(Event *event) +{ + bool empty; + + pthread_mutex_lock(&p_state_mutex); + p_wait_events.remove(event); + empty = p_wait_events.empty(); + pthread_mutex_unlock(&p_state_mutex); + + CommandQueue *q = (CommandQueue *) event->parent(); + if (q != NULL) q->releaseEvent(event); + return empty; +} + +bool Event::waitEventsAllCompleted() +{ +// YUAN TODO: p_wait_events is always shrinking, is lock necessary? +// it is a little bit faster without having to lock!!! +#if 1 + bool empty; + + pthread_mutex_lock(&p_state_mutex); + empty = p_wait_events.empty(); + pthread_mutex_unlock(&p_state_mutex); + + return empty; +#else + return p_wait_events.empty(); +#endif +} + +/****************************************************************************** +* void Event::reference, dereference +* This should be protected, since main thread and worker threads could all +* updating the event reference count +******************************************************************************/ +void Event::reference() +{ + pthread_mutex_lock(&p_state_mutex); + Object::reference(); + pthread_mutex_unlock(&p_state_mutex); +} + +bool Event::dereference() +{ + bool retval = false; + pthread_mutex_lock(&p_state_mutex); + retval = Object::dereference(); + pthread_mutex_unlock(&p_state_mutex); + return retval; +} + +/****************************************************************************** +* void Event::setDeviceData 
+******************************************************************************/ +void Event::setDeviceData(void *data) +{ + p_device_data = data; +} + +/****************************************************************************** +* void Event::updateTiming +******************************************************************************/ +void Event::updateTiming(Timing timing) +{ + if (timing >= Max) + return; + + pthread_mutex_lock(&p_state_mutex); + + // Don't update more than one time (NDRangeKernel for example) + if (p_timing[timing]) + { + pthread_mutex_unlock(&p_state_mutex); + return; + } + + struct timespec tp; + cl_ulong rs; + + if (clock_gettime(CLOCK_MONOTONIC, &tp) != 0) + clock_gettime(CLOCK_REALTIME, &tp); + + rs = tp.tv_nsec / 1000; // convert to microseconds + rs += tp.tv_sec * 1000000; // convert to microseconds + + p_timing[timing] = rs; + + pthread_mutex_unlock(&p_state_mutex); +} + +/****************************************************************************** +* Event::Status Event::status() const +******************************************************************************/ +Event::Status Event::status() const +{ + // HACK : We need const qualifier but we also need to lock a mutex + Event *me = (Event *)(void *)this; + + pthread_mutex_lock(&me->p_state_mutex); + + Status ret = p_status; + + pthread_mutex_unlock(&me->p_state_mutex); + + return ret; +} + +/****************************************************************************** +* void Event::waitForStatus(Status status) +******************************************************************************/ +void Event::waitForStatus(Status status) +{ + pthread_mutex_lock(&p_state_mutex); + + while (p_status != status && p_status > 0) + { + pthread_cond_wait(&p_state_change_cond, &p_state_mutex); + } + + pthread_mutex_unlock(&p_state_mutex); +} + +/****************************************************************************** +* void *Event::deviceData() 
+******************************************************************************/ +void *Event::deviceData() +{ + return p_device_data; +} + +/****************************************************************************** +* void Event::setCallback +******************************************************************************/ +void Event::setCallback(cl_int command_exec_callback_type, + event_callback callback, + void *user_data) +{ + CallbackData data; + + data.callback = callback; + data.user_data = user_data; + + pthread_mutex_lock(&p_state_mutex); + + p_callbacks.insert(std::pair<Status, CallbackData>( + (Status)command_exec_callback_type, + data)); + + pthread_mutex_unlock(&p_state_mutex); +} + +/****************************************************************************** +* cl_int Event::info +******************************************************************************/ +cl_int Event::info(cl_event_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) const +{ + void *value = 0; + size_t value_length = 0; + + union { + cl_command_queue cl_command_queue_var; + cl_context cl_context_var; + cl_command_type cl_command_type_var; + cl_int cl_int_var; + cl_uint cl_uint_var; + }; + + switch (param_name) + { + case CL_EVENT_COMMAND_QUEUE: + SIMPLE_ASSIGN(cl_command_queue, parent()); + break; + + case CL_EVENT_CONTEXT: + if (parent()) + { + SIMPLE_ASSIGN(cl_context, parent()->parent()); + } + else + { + if (type() == User) + SIMPLE_ASSIGN(cl_context, ((UserEvent *)this)->context()) + else + SIMPLE_ASSIGN(cl_context, 0); + } + break; + + case CL_EVENT_COMMAND_TYPE: + SIMPLE_ASSIGN(cl_command_type, type()); + break; + + // avoid status() call, if called from callbacks, we deadlock on mutex + case CL_EVENT_COMMAND_EXECUTION_STATUS: + SIMPLE_ASSIGN(cl_int, p_status); + break; + + case CL_EVENT_REFERENCE_COUNT: + SIMPLE_ASSIGN(cl_uint, references()); + break; + + default: + return CL_INVALID_VALUE; + } + + if (param_value && 
param_value_size < value_length) + return CL_INVALID_VALUE; + + if (param_value_size_ret) + *param_value_size_ret = value_length; + + if (param_value) + std::memcpy(param_value, value, value_length); + + return CL_SUCCESS; +} + +/****************************************************************************** +* cl_int Event::profilingInfo( +******************************************************************************/ +cl_int Event::profilingInfo(cl_profiling_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) const +{ + if (type() == Event::User) + return CL_PROFILING_INFO_NOT_AVAILABLE; + + // Check that the Command Queue has profiling enabled + cl_command_queue_properties queue_props; + cl_int rs; + + rs = ((CommandQueue *)parent())->info(CL_QUEUE_PROPERTIES, + sizeof(cl_command_queue_properties), + &queue_props, 0); + + if (rs != CL_SUCCESS) + return rs; + + if ((queue_props & CL_QUEUE_PROFILING_ENABLE) == 0) + return CL_PROFILING_INFO_NOT_AVAILABLE; + + // avoid status() call, if called from callbacks, we deadlock on mutex + if (p_status != Event::Complete) + return CL_PROFILING_INFO_NOT_AVAILABLE; + + void *value = 0; + size_t value_length = 0; + cl_ulong cl_ulong_var; + + switch (param_name) + { + case CL_PROFILING_COMMAND_QUEUED: + SIMPLE_ASSIGN(cl_ulong, 1000*p_timing[Queue]); + break; + + case CL_PROFILING_COMMAND_SUBMIT: + SIMPLE_ASSIGN(cl_ulong, 1000*p_timing[Submit]); + break; + + case CL_PROFILING_COMMAND_START: + SIMPLE_ASSIGN(cl_ulong, 1000*p_timing[Start]); + break; + + case CL_PROFILING_COMMAND_END: + SIMPLE_ASSIGN(cl_ulong, 1000*p_timing[End]); + break; + + default: + return CL_INVALID_VALUE; + } + + if (param_value && param_value_size < value_length) + return CL_INVALID_VALUE; + + if (param_value_size_ret) + *param_value_size_ret = value_length; + + if (param_value) + std::memcpy(param_value, value, value_length); + + return CL_SUCCESS; +} + diff --git a/src/core/commandqueue.h 
b/src/core/commandqueue.h new file mode 100644 index 0000000..7d2c65e --- /dev/null +++ b/src/core/commandqueue.h @@ -0,0 +1,494 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** + * \file commandqueue.h + * \brief Command queue and base class for events + */ + +#ifndef __COMMANDQUEUE_H__ +#define __COMMANDQUEUE_H__ + +#include "object.h" + +#include <CL/cl.h> +#include <pthread.h> + +#include <map> +#include <list> +#include <vector> + +namespace Coal +{ + +class Context; +class DeviceInterface; +class Event; + +/** + * \brief Command queue + * + * This class holds a list of events that will be pushed on a given device. + * + * More details are given on the \ref events page. + */ +class CommandQueue : public Object +{ + public: + CommandQueue(Context *ctx, + DeviceInterface *device, + cl_command_queue_properties properties, + cl_int *errcode_ret); + ~CommandQueue(); + + /** + * \brief Queue an event + * \param event event to be queued + * \return \c CL_SUCCESS if success, otherwise an error code + */ + cl_int queueEvent(Event *event); + + /** + * \brief Information about the command queue + * \copydetails Coal::DeviceInterface::info + */ + cl_int info(cl_command_queue_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) const; + + /** + * \brief Set properties of the command queue + * \note This function is deprecated and only there for OpenCL 1.0 + * compatibility + * \param properties property to enable or disable + * \param enable true to enable the property, false to disable it + * \param old_properties old value of the properties, ignored if NULL + * \return \c CL_SUCCESS if all is good, an error code if \p properties is + * invalid + */ + cl_int setProperty(cl_command_queue_properties properties, + cl_bool enable, + cl_command_queue_properties *old_properties); + + /** + * \brief Check the properties given + * \return \c CL_SUCCESS if they are valid, an error code otherwise + */ + cl_int checkProperties() const; + + /** + * \brief Push events on the device + * + * This function implements a big part of what is described in + * \ref events . 
+ * + * It is called by \c Coal::Event::setStatus() when an event is + * completed, or by \c queueEvent(). Its purpose is to explore the list + * of queued events (\c p_events) and to call + * \c Coal::DeviceInterface::pushEvent() for each event meeting its push + * conditions. + * + * \param ready_event is know to be pushable, push events in the + * queue till this point, skip the events after this one. + * + * \param one_event_completed_on_device can be used to differentiate + * whether this function is called by worker thread when an event is + * completed, or by main thread's queueEvent(). + * + * \section conditions Conditions + * + * If the command queue has the \c CL_OUT_OF_ORDER_EXEC_MODE_ENABLE + * property disabled, an event can be pushed only if all the previous + * ones in the list are completed with success. This way, an event + * must be completed before any other can be pushed. This ensures + * in-order execution. + * + * If this property is enable, more complex heuristics are used. + * + * The event list \c p_events is explored from top to bottom. At each + * loop iteration, checks are performed to see if the event can be pushed. + * + * - When a \c Coal::BarrierEvent is encountered, no more events can be + * pushed, except if the \c Coal::BarrierEvent is the first in the list, + * as that means there are no other events that can be pushed, so the + * barrier can go away + * - All events that are already pushed or finished are skipped + * - The wait list of the event is then explored to ensure that all its + * dependencies are met. + * - Finally, if the events passes all the tests, it is either pushed on + * the device, or simply set to \c Coal::Event::Complete if it's a + * dummy event (see \c Coal::Event::isInstantaneous()). + */ + void pushEventsOnDevice(Event *ready_event = NULL, + bool one_event_completed_on_device = false); + + /** + * \brief Push an event onto p_release_event list + * + * Later main thread will perform release event action. 
+ */ + void releaseEvent(Event *e); + + /** + * \brief Remove from the event list completed events + * + * This function is called periodically to clean the event list from + * completed events. + * + * It is needed to do that out of \c pushEventsOnDevice() as deleting + * event may \c dereference() this command queue, and also delete it. It + * would produce crashes. + */ + void cleanEvents(); + + /** + * \brief Release events on the released event list + * + * This function is called periodically to release the events on the + * released events list. This is only performed on the main thread + * because deleting/freeing memory from worker thread has caused + * weird memory problems on ARM. + * + */ + void cleanReleasedEvents(); + + /** + * \brief Flush the command queue + * + * Pushes all the events on the device, and then return. The event + * don't need to be completed after this call. + */ + void flush(); + + /** + * \brief Finish the command queue + * + * Pushes the events like \c flush() but also wait for them to be + * completed before returning. + */ + void finish(); + + /** + * \brief Return all the events in the command queue + * \note Retains all the events + * \param count number of events in the event queue + * \param include_completed_events default to true + * \return events currently in the event queue + */ + Event **events(unsigned int &count, + bool include_completed_events = true); + + private: + DeviceInterface *p_device; + cl_int p_num_events_in_queue; + cl_int p_num_events_on_device; + cl_int p_num_events_completed; + cl_command_queue_properties p_properties; + + std::list<Event *> p_events; + std::list<Event *> p_released_events; + pthread_mutex_t p_event_list_mutex; + pthread_cond_t p_event_list_cond; + bool p_flushed; +}; + +/** + * \brief Base class for all events + * + * This class contains logic common to all the events. + * + * Beside handling OpenCL-specific stuff, \c Coal::Event objects do nothing + * implementation-wise. 
They do not compile kernels, copy data around, etc. + * They only contain static and immutable data that is then used by the devices + * to actually implement the event. + */ +class Event : public Object +{ + public: + /** + * \brief Event type + * + * The allows objects using \c Coal::Event to know which event it is, + * and to cast it to the correct sub-class. + */ + enum Type + { + NDRangeKernel = CL_COMMAND_NDRANGE_KERNEL, + TaskKernel = CL_COMMAND_TASK, + NativeKernel = CL_COMMAND_NATIVE_KERNEL, + ReadBuffer = CL_COMMAND_READ_BUFFER, + WriteBuffer = CL_COMMAND_WRITE_BUFFER, + CopyBuffer = CL_COMMAND_COPY_BUFFER, + ReadImage = CL_COMMAND_READ_IMAGE, + WriteImage = CL_COMMAND_WRITE_IMAGE, + CopyImage = CL_COMMAND_COPY_IMAGE, + CopyImageToBuffer = CL_COMMAND_COPY_IMAGE_TO_BUFFER, + CopyBufferToImage = CL_COMMAND_COPY_BUFFER_TO_IMAGE, + MapBuffer = CL_COMMAND_MAP_BUFFER, + MapImage = CL_COMMAND_MAP_IMAGE, + UnmapMemObject = CL_COMMAND_UNMAP_MEM_OBJECT, + Marker = CL_COMMAND_MARKER, + AcquireGLObjects = CL_COMMAND_ACQUIRE_GL_OBJECTS, + ReleaseGLObjects = CL_COMMAND_RELEASE_GL_OBJECTS, + ReadBufferRect = CL_COMMAND_READ_BUFFER_RECT, + WriteBufferRect = CL_COMMAND_WRITE_BUFFER_RECT, + CopyBufferRect = CL_COMMAND_COPY_BUFFER_RECT, + User = CL_COMMAND_USER, + Barrier, + WaitForEvents + }; + + /** + * \brief Event status + */ + enum Status + { + Queued = CL_QUEUED, /*!< \brief Simply queued in a command queue */ + Submitted = CL_SUBMITTED, /*!< \brief Submitted to a device */ + Running = CL_RUNNING, /*!< \brief Running on the device */ + Complete = CL_COMPLETE /*!< \brief Completed */ + }; + + /** + * \brief Function that can be called when an event change status + */ + typedef void (CL_CALLBACK *event_callback)(cl_event, cl_int, void *); + + /** + * Structure used internally by \c Coal::Event to store for each event + * status the callbacks to call with the corresponding \c user_data. 
+ */ + struct CallbackData + { + event_callback callback; /*!< Function to call */ + void *user_data; /*!< Pointer to pass as its third argument */ + }; + + /** + * \brief Timing counters of an event + */ + enum Timing + { + Queue, /*!< Time when the event was queued */ + Submit, /*!< Time when the event was submitted to the device */ + Start, /*!< Time when its execution began on the device */ + End, /*!< Time when its execution finished */ + Max /*!< Number of items in this enum */ + }; + + public: + /** + * \brief Constructor + * \param parent parent \c Coal::CommandQueue + * \param status \c Status the event has when it is created + * \param num_events_in_wait_list number of events to wait on + * \param event_wait_list list of events to wait on + * \param errcode_ret return value + */ + Event(CommandQueue *parent, + Status status, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret); + + void freeDeviceData(); /*!< \brief Call \c Coal::DeviceInterface::freeEventDeviceData() */ + virtual ~Event(); /*!< \brief Destructor */ + + /** + * \brief Type of the event + * \return type of the event + */ + virtual Type type() const = 0; + + /** + * \brief Dummy event + * + * A dummy event is an event that doesn't have to be pushed on a device, + * it is only a hint for \c Coal::CommandQueue + * + * \return true if the event is dummy + */ + bool isInstantaneous() const; + + /** + * \brief Set the event status + * + * This function calls the event callbacks, and + * \c Coal::CommandQueue::pushEventsOnDevice() if \p status is + * \c Complete . + * + * \param status new status of the event + */ + void setStatus(Status status); + + /** + * \brief Increase Event reference count + * + * This function uses mutex to protect the reference count + * \c update in the underlying object. 
+ */ + void reference(); + + /** + * \brief Decrease Event reference count + * + * This function uses mutex to protect the reference count + * \c update in the underlying object. + * + * \return true if the reference count is decreased to 0 + */ + bool dereference(); + + /** + * \brief Set device-specific data + * \param data device-specific data + */ + void setDeviceData(void *data); + + /** + * \brief Update timing info + * + * This function reads current system time and puts it in \c p_timing + * + * \param timing timing event having just finished + */ + void updateTiming(Timing timing); + + /** + * \brief Status + * \return status of the event + */ + Status status() const; + + /** + * \brief Wait for a specified status + * + * This function blocks until the event's status is set to \p status + * by another thread. + * + * \param status the status the event must have for the function to return + */ + void waitForStatus(Status status); + + /** + * \brief Device-specific data + * \return data set using \c setDeviceData() + */ + void *deviceData(); + + /** + * \brief Add a callback for this event + * \param command_exec_callback_type status the event must have in order + * to have the callback called + * \param callback callback function + * \param user_data user data given to the callback + */ + void setCallback(cl_int command_exec_callback_type, + event_callback callback, + void *user_data); + + /** + * \brief Info about the event + * \copydetails Coal::DeviceInterface::info + */ + cl_int info(cl_event_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) const; + + /** + * \brief Profiling info + * \copydetails Coal::DeviceInterface::info + */ + cl_int profilingInfo(cl_profiling_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) const; + + /** + * \brief Call \c Coal::CommandQueue::pushEventsOnDevice() for each command queue + * in which this event is queued or each queue 
with an event waiting on this event + */ + void flushQueues(); + + + /** + * \brief Add event to p_dependent_events, which will be notified when + * current event completes. If current event is already complete, + * no need to add and return false. + * \param event the event to be notified + */ + bool addDependentEvent(Event *event); + + /** + * \brief Remove event from p_wait_events, which should be waited on + * before current event can start. When p_wait_events becomes empty, + * return true to indicate that current event is ready to be pushed. + * \param event the event to be removed from p_wait_events + */ + bool removeWaitEvent(Event *event); + + /** + * \brief Check if there are no more events to wait on before current + * event can start. + */ + bool waitEventsAllCompleted(); + + private: + /** + * \brief Helper function for setStatus() + * return number of dependent events + */ + int setStatusHelper(Status status); + + private: + pthread_cond_t p_state_change_cond; + pthread_mutex_t p_state_mutex; + + Status p_status; + void *p_device_data; + std::multimap<Status, CallbackData> p_callbacks; + + cl_uint p_timing[Max]; + + // p_wait_events: I should wait after these events complete + // p_dependent_events: when I complete, I should notify these events + std::list<const Event *> p_wait_events; + std::vector<Event *> p_dependent_events; +}; + +} + +struct _cl_command_queue : public Coal::CommandQueue +{}; + +struct _cl_event : public Coal::Event +{}; + +#endif diff --git a/src/core/compiler.cpp b/src/core/compiler.cpp new file mode 100644 index 0000000..d4d5240 --- /dev/null +++ b/src/core/compiler.cpp @@ -0,0 +1,342 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** + * \file compiler.cpp + * \brief Compiler wrapper around Clang + */ + +#include "compiler.h" +#include "deviceinterface.h" + +#include <cstring> +#include <cstdio> +#include <string> +#include <sstream> +#include <iostream> +#include <clang/Frontend/CompilerInvocation.h> +#include <clang/Frontend/TextDiagnosticPrinter.h> +#include <clang/Frontend/LangStandard.h> +#include <clang/Basic/Diagnostic.h> +#include <clang/CodeGen/CodeGenAction.h> +#include <llvm/ADT/SmallVector.h> +#include <llvm/Support/Host.h> +#include <llvm/Support/MemoryBuffer.h> // ASW +#include <llvm/IR/Module.h> +#include <llvm/IR/LLVMContext.h> +#include <sys/stat.h> + +std::string get_ocl_dsp(); + +using namespace Coal; + +Compiler::Compiler(DeviceInterface *device) +: p_device(device), p_module(0), p_optimize(true), p_log_stream(p_log), + p_log_printer(0) +{ +} + +Compiler::~Compiler() +{ + +} + +int Compiler::compile(const std::string &options, + llvm::MemoryBuffer *source) +{ + /* Set options */ + p_options = options; + + clang::CodeGenOptions &codegen_opts = p_compiler.getCodeGenOpts(); + clang::DiagnosticOptions &diag_opts = p_compiler.getDiagnosticOpts(); + clang::FrontendOptions &frontend_opts = p_compiler.getFrontendOpts(); + clang::HeaderSearchOptions &header_opts = p_compiler.getHeaderSearchOpts(); + clang::LangOptions &lang_opts = p_compiler.getLangOpts(); + clang::TargetOptions &target_opts = p_compiler.getTargetOpts(); + clang::PreprocessorOptions &prep_opts = p_compiler.getPreprocessorOpts(); + clang::CompilerInvocation &invocation = p_compiler.getInvocation(); + + // Set codegen options + codegen_opts.setDebugInfo(clang::CodeGenOptions::NoDebugInfo); + codegen_opts.AsmVerbose = true; + codegen_opts.CodeModel = "default"; + + // level 3 is too much for the pocl transformations. 
+ codegen_opts.OptimizationLevel = 2; + + // Set diagnostic options + diag_opts.Pedantic = true; + diag_opts.ShowColumn = true; + diag_opts.ShowLocation = true; + diag_opts.ShowCarets = false; + diag_opts.ShowFixits = true; + diag_opts.ShowColors = false; + diag_opts.ErrorLimit = 19; + diag_opts.MessageLength = 0; + + // Set frontend options + frontend_opts.ProgramAction = clang::frontend::EmitLLVMOnly; + frontend_opts.DisableFree = true; + + // Set header search options + header_opts.Verbose = false; + header_opts.UseBuiltinIncludes = false; + header_opts.UseStandardSystemIncludes = false; + header_opts.UseStandardCXXIncludes = false; + + // Set preprocessor options + prep_opts.RetainRemappedFileBuffers = true; + //prep_opts.ImplicitPCHInclude = "/usr/share/ti/opencl/clc.h"; + prep_opts.Includes.push_back("clc.h"); + prep_opts.Includes.push_back(p_device->builtinsHeader()); + + // Set lang options + lang_opts.NoBuiltin = true; + lang_opts.OpenCL = true; + lang_opts.CPlusPlus = false; + + // Set target options + cl_device_type devtype; + p_device->info(CL_DEVICE_TYPE, sizeof(devtype), &devtype, 0); + + if (devtype == CL_DEVICE_TYPE_CPU) { + // Originally: target_opts.Triple = llvm::sys::getHostTriple(); + target_opts.Triple = llvm::sys::getDefaultTargetTriple(); + } + else // devtype != CL_DEVICE_TYPE_CPU + { + // For 6X, use the 'spir' target, since it implements opencl specs + target_opts.Triple = "spir-unknown-unknown-unknown"; + + // Currently, llp6x does not handle fused multiply and add + // llvm intrinsics (llvm.fmuladd.*). 
Disable generating these + // intrinsics using clang -ffp-contract=off option + codegen_opts.setFPContractMode(clang::CodeGenOptions::FPC_Off); + } + + // Parse the user options + std::istringstream options_stream(options); + std::string token; + bool Werror = false, inI = false, inD = false; + +#ifndef SHAMROCK_BUILD + // Add opencl-headers' package default install include path as location to search + std::string header_path(get_ocl_dsp()); +#else // TODO: /usr/include/CL is where opencl headers go, but use ENV vars? + std::string header_path("/usr/include/CL"); +#endif + header_opts.AddPath(header_path, clang::frontend::Angled, false, false); + + + while (options_stream >> token) + { + if (inI) + { + // token is an include path + header_opts.AddPath(token, clang::frontend::Angled, false, false); + inI = false; + continue; + } + else if (inD) + { + // token is name or name=value + prep_opts.addMacroDef(token); + inD = false; + continue; + } + + //Handle -I xxx or -Ixxx. Assuming no other -I option prefix + if (token == "-I") + { + inI = true; + } + else if (token.compare(0,2,"-I") == 0) + { + header_opts.AddPath(token.substr(2), clang::frontend::Angled, false, + false); + } + //Handle -D xxx or -Dxxx. 
Assuming no other -D option prefix + else if (token == "-D") + { + inD = true; + } + else if (token.compare(0,2,"-D") == 0) //Handle -Dxxx (no space between) + { + prep_opts.addMacroDef(token.substr(2)); + } + else if (token == "-cl-single-precision-constant") + { + lang_opts.SinglePrecisionConstants = true; + } + else if (token == "-cl-opt-disable") + { + p_optimize = false; + codegen_opts.OptimizationLevel = 0; + } + else if (token == "-cl-mad-enable") + { + codegen_opts.LessPreciseFPMAD = true; + } + else if (token == "-cl-unsafe-math-optimizations") + { + codegen_opts.UnsafeFPMath = true; + } + else if (token == "-cl-finite-math-only") + { + codegen_opts.NoInfsFPMath = true; + codegen_opts.NoNaNsFPMath = true; + } + else if (token == "-cl-fast-relaxed-math") + { + codegen_opts.UnsafeFPMath = true; + codegen_opts.NoInfsFPMath = true; + codegen_opts.NoNaNsFPMath = true; + lang_opts.FastRelaxedMath = true; + } + else if (token == "-w") + { + diag_opts.IgnoreWarnings = true; + } + else if (token == "-Werror") + { + Werror = true; + } + else if (token == "-cl-std=CL1.1") + { + } + else + { + return CL_INVALID_BUILD_OPTIONS; + } + } + + add_macrodefs_for_supported_opencl_extensions(prep_opts); + + // Set invocation options + //invocation.setLangDefaults(lang_opts,clang::IK_OpenCL); + invocation.setLangDefaults(lang_opts,clang::IK_OpenCL, clang::LangStandard::lang_opencl12); + + // Create the diagnostics engine + p_log_printer = new clang::TextDiagnosticPrinter(p_log_stream, &diag_opts); + p_compiler.createDiagnostics(p_log_printer); + + if (!p_compiler.hasDiagnostics()) + return false; + + p_compiler.getDiagnostics().setWarningsAsErrors(Werror); + + // Feed the compiler with source + frontend_opts.Inputs.push_back(clang::FrontendInputFile("program.cl", clang::IK_OpenCL)); + + //ASW TODO cleanup +#if 0 + prep_opts.addRemappedFile("program.cl", source); +#else + + const llvm::StringRef s_data(source->getBuffer()); + const llvm::StringRef s_name("<source>"); + 
llvm::MemoryBuffer *buffer = + llvm::MemoryBuffer::getMemBuffer(s_data, s_name); + + prep_opts.addRemappedFile("program.cl", buffer); +#endif + + //timespec t0, t1; + //clock_gettime(CLOCK_MONOTONIC, &t0); + // Compile + + clang::CodeGenAction *Act = new clang::EmitLLVMOnlyAction(&llvm::getGlobalContext()); + if (!p_compiler.ExecuteAction(*Act)) + { + // DEBUG + std::cout << log() << std::endl; + return true; + } + + //clock_gettime(CLOCK_MONOTONIC, &t1); + //printf("clang time: %6.4f secs\n", + //(float)t1.tv_sec-t0.tv_sec+(t1.tv_nsec-t0.tv_nsec)/1e9); + + p_log_stream.flush(); + p_module = Act->takeModule(); + + // uncomment to debug the llvm IR + // p_module->dump(); + + return false; +} + +// Query the device to get list of supported OpenCL extensions. Standard +// requires that each supported extension has a macro definition with the +// same name as the extension +void Compiler::add_macrodefs_for_supported_opencl_extensions + (clang::PreprocessorOptions &prep_opts) +{ + // Get the extensions string for the device + size_t size; + p_device->info(CL_DEVICE_EXTENSIONS, 0, NULL, &size); + + char *extensions = new char[size + 1]; + memset( extensions, CHAR_MIN, sizeof(char)*(size+1) ); + + p_device->info(CL_DEVICE_EXTENSIONS, sizeof(char)*size, extensions, NULL); + + // Create macro definitions from the extension names + std::istringstream extensions_stream(extensions); + std::string token; + + while (extensions_stream >> token) + prep_opts.addMacroDef(token); + + delete [] extensions; +} + +const std::string &Compiler::log() const +{ + return p_log; +} + +const std::string &Compiler::options() const +{ + return p_options; +} + +bool Compiler::optimize() const +{ + return p_optimize; +} + +llvm::Module *Compiler::module() const +{ + return p_module; +} + +void Compiler::appendLog(const std::string &log) +{ + p_log += log; +} diff --git a/src/core/compiler.h b/src/core/compiler.h new file mode 100644 index 0000000..58788e6 --- /dev/null +++ b/src/core/compiler.h @@ 
-0,0 +1,138 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** + * \file compiler.h + * \brief Compiler wrapped around Clang + */ + +#ifndef __COMPILER_H__ +#define __COMPILER_H__ + +#include <string> + +#include <clang/Frontend/CompilerInstance.h> +#include <llvm/Support/raw_ostream.h> + +namespace llvm +{ + class MemoryBuffer; + class Module; +} + +namespace clang +{ + class TextDiagnosticPrinter; +} + +namespace Coal +{ + +class DeviceInterface; + +/** + * \brief Compiler using Clang + * + * This class builds a Clang instance, runs it and then retains compilation logs + * and produced data. + */ +class Compiler +{ + public: + /** + * \brief Constructor + * \param device \c Coal::DeviceInterface for which code will be compiled + */ + Compiler(DeviceInterface *device); + ~Compiler(); + + /** + * \brief Compile \p source to produce a LLVM module + * \param options options given to the compiler, described in the OpenCL spec + * \param source source to be compiled + * \return true if the compilation is successful, false otherwise + * 2 if illegal options + * \sa module() + * \sa log() + */ + int compile(const std::string &options, llvm::MemoryBuffer *source); + + /** + * \brief Compilation log + * \note \c appendLog() can also be used to append custom info at the end + * of the log, for instance to keep compilation and linking logs + * in the same place + * \return log + */ + const std::string &log() const; + + /** + * \brief Options given at \c compile() + * \return options used during compilation + */ + const std::string &options() const; + + /** + * \brief Optimization enabled + * \return true if -cl-opt-disable was given in the options, false otherwise + */ + bool optimize() const; + + /** + * \brief LLVM module generated + * \return LLVM module generated by the compilation, 0 if an error occured + */ + llvm::Module *module() const; + + /** + * \brief Append a string to the log + * + * This function can be used to append linking or code-gen logs to the + * internal compilation log kept by this class + * + * 
\param log log to be appended + */ + void appendLog(const std::string &log); + + private: + DeviceInterface *p_device; + clang::CompilerInstance p_compiler; + llvm::Module *p_module; + bool p_optimize; + + std::string p_log, p_options; + llvm::raw_string_ostream p_log_stream; + clang::TextDiagnosticPrinter *p_log_printer; + + void add_macrodefs_for_supported_opencl_extensions + (clang::PreprocessorOptions &prep_opts); + +}; + +} + +#endif diff --git a/src/core/config.h b/src/core/config.h new file mode 100644 index 0000000..e1e401b --- /dev/null +++ b/src/core/config.h @@ -0,0 +1,9 @@ +#ifndef __CONFIG_H__ +#define __CONFIG_H__ + +#define LLVM_VERSION "3.5.0svn" +#define COAL_VERSION "" + +#define MAX_WORK_DIMS 3 + +#endif diff --git a/src/core/config.h.cmake b/src/core/config.h.cmake new file mode 100644 index 0000000..ccf87b7 --- /dev/null +++ b/src/core/config.h.cmake @@ -0,0 +1,9 @@ +#ifndef __CONFIG_H__ +#define __CONFIG_H__ + +#define LLVM_VERSION "@LLVM_VERSION@" +#define COAL_VERSION "@Coal_VERSION@" + +#define MAX_WORK_DIMS 3 + +#endif diff --git a/src/core/context.cpp b/src/core/context.cpp new file mode 100644 index 0000000..e9129ff --- /dev/null +++ b/src/core/context.cpp @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ *     * Neither the name of the copyright holder nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file context.cpp
+ * \brief Context
+ */
+
+#include "context.h"
+#include "deviceinterface.h"
+#include "propertylist.h"
+#include "platform.h"
+
+#include <cstring>
+#include <cstdlib>
+
+#include <llvm/Support/TargetSelect.h>
+
+using namespace Coal;
+
+// No-op callback installed when the application passes no pfn_notify, so
+// p_pfn_notify is never null.
+static void default_pfn_notify(const char *, const void *, size_t, void *)
+{
+    return;
+}
+
+// Builds a context over \p devices.
+//
+// Failures are reported only through *errcode_ret, followed by an early
+// return that leaves the object partially initialised. *errcode_ret is
+// dereferenced unconditionally, so it must not be null. It is assigned on the
+// failure paths and by the per-device availability query; with num_devices
+// == 0 and no failure it is left untouched -- presumably the caller
+// initialises it, TODO confirm.
+Context::Context(const cl_context_properties *properties,
+                 cl_uint num_devices,
+                 const cl_device_id *devices,
+                 void (CL_CALLBACK *pfn_notify)(const char *, const void *,
+                                               size_t, void *),
+                 void *user_data,
+                 cl_int *errcode_ret)
+: Object(Object::T_Context, 0), p_properties(0), p_pfn_notify(pfn_notify),
+  p_user_data(user_data), p_devices(0), p_num_devices(0), p_props_len(0),
+  p_platform(&the_platform)
+{
+    if (!p_pfn_notify)
+        p_pfn_notify = &default_pfn_notify;
+
+    // Initialize LLVM, this can be done more than one time per program
+    llvm::InitializeNativeTarget();
+    llvm::InitializeNativeTargetAsmPrinter();
+    llvm::InitializeNativeTargetAsmParser();
+
+    // Explore the properties
+    if (properties)
+    {
+        // The list is walked as raw bytes so that props_len ends up being the
+        // byte length of the whole list, terminating 0 included.
+        const unsigned char *props = (const unsigned char *)properties;
+        cl_context_properties prop;
+        size_t props_len = 0;
+
+// Reads one value of the given type at the cursor, then advances both the
+// cursor and the running byte count. The macro stays defined after this
+// function.
+#define GET_PROP(type, var)     \
+    var = *(const type *)props; \
+    props += sizeof(type);      \
+    props_len += sizeof(type);
+
+        // Set once CL_CONTEXT_PLATFORM has been seen; a second occurrence is
+        // rejected.
+        int propset = 0;
+        while (true)
+        {
+            GET_PROP(cl_context_properties, prop)
+
+            // A zero property name terminates the list
+            if (!prop)
+                break;
+
+            switch (prop)
+            {
+                case CL_CONTEXT_PLATFORM:
+                    if (!propset)
+                    {
+                        GET_PROP(cl_platform_id, p_platform);
+                        propset = 1;
+                    }
+                    else
+                    {
+                        *errcode_ret = CL_INVALID_PROPERTY;
+                        return;
+                    }
+                    break;
+
+                default:
+                    // CL_CONTEXT_PLATFORM is the only property understood
+                    *errcode_ret = CL_INVALID_PROPERTY;
+                    return;
+            }
+        }
+
+        // properties may be allocated on the stack of the client application
+        // copy it into a real buffer
+        p_properties = (cl_context_properties *)std::malloc(props_len);
+        p_props_len = props_len;
+
+        if (!p_properties)
+        {
+            *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+            return;
+        }
+
+        std::memcpy((void *)p_properties, (const void *)properties, props_len);
+    }
+
+    // Verify that the platform is good
+    if (p_platform != &the_platform)
+    {
+        *errcode_ret = CL_INVALID_PLATFORM;
+        return;
+    }
+
+    // Explore the devices
+    p_devices = (DeviceInterface **)std::malloc(num_devices * sizeof(DeviceInterface *));
+    p_num_devices = num_devices;
+
+    if (!p_devices)
+    {
+        *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+        return;
+    }
+
+    // On any early return below, entries of p_devices from index i onwards
+    // have not been written.
+    for (cl_uint i=0; i<num_devices; ++i)
+    {
+        cl_device_id device = devices[i];
+
+        if (device == 0)
+        {
+            *errcode_ret = CL_INVALID_DEVICE;
+            return;
+        }
+
+        // Verify that the device is available
+        cl_bool device_available;
+
+        *errcode_ret = device->info(CL_DEVICE_AVAILABLE,
+                                    sizeof(device_available),
+                                    &device_available,
+                                    0);
+
+        if (*errcode_ret != CL_SUCCESS)
+            return;
+
+        if (!device_available)
+        {
+            *errcode_ret = CL_DEVICE_NOT_AVAILABLE;
+            return;
+        }
+
+        // Add the device to the list
+        p_devices[i] = (DeviceInterface *)device;
+    }
+}
+
+// Releases the copies made by the constructor. The devices themselves are
+// not touched; only the pointer array is freed.
+Context::~Context()
+{
+    if (p_properties)
+        std::free((void *)p_properties);
+
+    if (p_devices)
+        std::free((void *)p_devices);
+}
+
+// Context queries. Each case selects a value through SIMPLE_ASSIGN or
+// MEM_ASSIGN, which are defined outside this file; presumably they set the
+// locals `value` and `value_length` (SIMPLE_ASSIGN via the anonymous union)
+// -- TODO confirm against their definition. The common tail then validates
+// the caller's buffer size and copies the value out.
+cl_int Context::info(cl_context_info param_name,
+                     size_t param_value_size,
+                     void *param_value,
+                     size_t *param_value_size_ret) const
+{
+    void *value = 0;
+    size_t value_length = 0;
+
+    union {
+        cl_uint cl_uint_var;
+    };
+
+    switch (param_name)
+    {
+        case CL_CONTEXT_REFERENCE_COUNT:
+            SIMPLE_ASSIGN(cl_uint, references());
+            break;
+
+        case CL_CONTEXT_NUM_DEVICES:
+            SIMPLE_ASSIGN(cl_uint, p_num_devices);
+            break;
+
+        case CL_CONTEXT_DEVICES:
+            MEM_ASSIGN(p_num_devices * sizeof(DeviceInterface *), p_devices);
+            break;
+
+        case CL_CONTEXT_PROPERTIES:
+            MEM_ASSIGN(p_props_len, p_properties);
+            break;
+
+        default:
+            return CL_INVALID_VALUE;
+    }
+
+    // A destination buffer that is too small is an error; a null
+    // param_value is a pure size query.
+    if (param_value && param_value_size < value_length)
+        return CL_INVALID_VALUE;
+
+    if (param_value_size_ret)
+        *param_value_size_ret = value_length;
+
+    if (param_value && value_length /* CONTEXT_PROPERTIES can be of length 0 */)
+        std::memcpy(param_value, value, value_length);
+
+    return CL_SUCCESS;
+}
+
+// Linear search of the device array by pointer identity.
+bool Context::hasDevice(DeviceInterface *device) const
+{
+    for (unsigned int i=0; i<p_num_devices; ++i)
+        if (p_devices[i] == device)
+            return true;
+
+    return false;
+}
diff --git a/src/core/context.h b/src/core/context.h
new file mode 100644
index 0000000..4712d25
--- /dev/null
+++ b/src/core/context.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file context.h + * \brief OpenCL context + */ + +#ifndef __CONTEXT_H__ +#define __CONTEXT_H__ + +#include "object.h" + +#include <CL/cl.h> + +namespace Coal +{ + +class DeviceInterface; + +/** + * \brief OpenCL context + * + * This class is the root of all OpenCL objects, except \c Coal::DeviceInterface. 
+ */
+class Context : public Object
+{
+    public:
+        /**
+         * \brief Constructor
+         * \param properties properties of the context; a zero-terminated
+         *        list that is copied. Only \c CL_CONTEXT_PLATFORM is
+         *        accepted, at most once
+         * \param num_devices number of devices that will be used
+         * \param devices \c Coal::DeviceInterface to be used
+         * \param pfn_notify function to call when an error arises, to give
+         *        more detail; a no-op callback is installed when this is 0
+         * \param user_data user data to pass to \p pfn_notify
+         * \param errcode_ret return code; must not be null. It is written on
+         *        failure and by the per-device availability query, so callers
+         *        should initialise it before the call
+         */
+        Context(const cl_context_properties *properties,
+                cl_uint num_devices,
+                const cl_device_id *devices,
+                void (CL_CALLBACK *pfn_notify)(const char *, const void *,
+                                               size_t, void *),
+                void *user_data,
+                cl_int *errcode_ret);
+        ~Context();
+
+        /**
+         * \brief Info about the context
+         * \copydetails Coal::DeviceInterface::info
+         */
+        cl_int info(cl_context_info param_name,
+                    size_t param_value_size,
+                    void *param_value,
+                    size_t *param_value_size_ret) const;
+
+        /**
+         * \brief Check that this context contains a given \p device
+         * \param device device to check
+         * \return whether this context contains \p device
+         */
+        bool hasDevice(DeviceInterface *device) const;
+
+    private:
+        cl_context_properties *p_properties;    /*!< \brief malloc'ed copy of the property list, 0 if none was given */
+        void (CL_CALLBACK *p_pfn_notify)(const char *, const void *,
+                                               size_t, void *); /*!< \brief Notification callback, never 0 after construction */
+        void *p_user_data;                      /*!< \brief Opaque pointer stored for \c p_pfn_notify */
+
+        DeviceInterface **p_devices;            /*!< \brief malloc'ed array of \c p_num_devices device pointers */
+        unsigned int p_num_devices, p_props_len; /*!< \brief Device count, and byte length of \c p_properties */
+        cl_platform_id p_platform;              /*!< \brief Platform of the context */
+};
+
+}
+
+struct _cl_context : public Coal::Context
+{};
+
+#endif
diff --git a/src/core/cpu/buffer.cpp b/src/core/cpu/buffer.cpp
new file mode 100644
index 0000000..9125872
--- /dev/null
+++ b/src/core/cpu/buffer.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** + * \file cpu/buffer.cpp + * \brief CPU buffer + */ + +#include "buffer.h" +#include "device.h" + +#include "../memobject.h" + +#include <cstdlib> +#include <cstring> +#include <iostream> + +using namespace Coal; + +CPUBuffer::CPUBuffer(CPUDevice *device, MemObject *buffer, cl_int *rs) +: DeviceBuffer(), p_device(device), p_buffer(buffer), p_data(0), + p_data_malloced(false) +{ + if (buffer->type() == MemObject::SubBuffer) + { + // We need to create this CPUBuffer based on the CPUBuffer of the + // parent buffer + SubBuffer *subbuf = (SubBuffer *)buffer; + MemObject *parent = subbuf->parent(); + CPUBuffer *parentcpubuf = (CPUBuffer *)parent->deviceBuffer(device); + + char *tmp_data = (char *)parentcpubuf->data(); + tmp_data += subbuf->offset(); + + p_data = (void *)tmp_data; + } + else if (buffer->flags() & CL_MEM_USE_HOST_PTR) + { + // We use the host ptr, we are already allocated + p_data = buffer->host_ptr(); + } + + // NOTE: This function can also reject Image buffers by setting a value + // != CL_SUCCESS in rs. +} + +CPUBuffer::~CPUBuffer() +{ + if (p_data_malloced) + { + std::free((void *)p_data); + } +} + +void *CPUBuffer::data() const +{ + return p_data; +} + +void *CPUBuffer::nativeGlobalPointer() const +{ + return data(); +} + +bool CPUBuffer::allocate() +{ + size_t buf_size = p_buffer->size(); + + if (buf_size == 0) + // Something went wrong... 
+ return false; + + if (!p_data) + { + // We don't use a host ptr, we need to allocate a buffer + p_data = std::malloc(buf_size); + + if (!p_data) + return false; + + p_data_malloced = true; + } + + if (p_buffer->type() != MemObject::SubBuffer && + p_buffer->flags() & CL_MEM_COPY_HOST_PTR) + { + std::memcpy(p_data, p_buffer->host_ptr(), buf_size); + } + + // Say to the memobject that we are allocated + p_buffer->deviceAllocated(this); + + return true; +} + +DeviceInterface *CPUBuffer::device() const +{ + return p_device; +} + +bool CPUBuffer::allocated() const +{ + return p_data != 0; +} diff --git a/src/core/cpu/buffer.h b/src/core/cpu/buffer.h new file mode 100644 index 0000000..d88c9e5 --- /dev/null +++ b/src/core/cpu/buffer.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file cpu/buffer.h
+ * \brief CPU buffer
+ */
+
+#ifndef __CPU_BUFFER_H__
+#define __CPU_BUFFER_H__
+
+#include "../deviceinterface.h"
+
+namespace Coal
+{
+
+class CPUDevice;
+class MemObject;
+
+/**
+ * \brief CPU implementation of \c Coal::MemObject
+ *
+ * This class is responsible of the actual allocation of buffer objects, using
+ * \c malloc() or by reusing a given \c host_ptr.
+ */
+class CPUBuffer : public DeviceBuffer
+{
+    public:
+        /**
+         * \brief Constructor
+         *
+         * A sub-buffer is pointed at its parent's data plus its offset, and
+         * a \c CL_MEM_USE_HOST_PTR buffer adopts the host pointer; any other
+         * buffer has no data until \c allocate() is called.
+         *
+         * \param device Device for which the buffer is allocated
+         * \param buffer \c Coal::MemObject holding information about the buffer
+         * \param rs return code; the CPU implementation currently never
+         *        writes it, so callers should initialise it themselves
+         */
+        CPUBuffer(CPUDevice *device, MemObject *buffer, cl_int *rs);
+        ~CPUBuffer();
+
+        /**
+         * \brief Make sure the buffer has storage
+         *
+         * Allocates with \c malloc() when no data pointer is set yet, copies
+         * the host data in for non-sub-buffers created with
+         * \c CL_MEM_COPY_HOST_PTR, then notifies the \c Coal::MemObject.
+         *
+         * \return false if the buffer size is 0 or the allocation failed,
+         *         true otherwise
+         */
+        bool allocate();
+        DeviceInterface *device() const;    /*!< \brief Device given at construction */
+        void *data() const;                 /*!< \brief Pointer to the buffer's data */
+        void *nativeGlobalPointer() const;  /*!< \brief Same pointer as \c data() */
+        bool allocated() const;             /*!< \brief Whether a data pointer is set; may already be true right after construction */
+
+    private:
+        CPUDevice *p_device;    /*!< \brief Device owning this buffer */
+        MemObject *p_buffer;    /*!< \brief Memory object described by this buffer */
+        void *p_data;           /*!< \brief Storage: malloc'ed, host pointer, or inside the parent buffer */
+        bool p_data_malloced;   /*!< \brief true when \c p_data came from \c malloc() and must be freed */
+};
+
+}
+
+#endif
diff --git a/src/core/cpu/builtins.cpp b/src/core/cpu/builtins.cpp
new file mode 100644
index 0000000..137d34e
--- /dev/null
+++ b/src/core/cpu/builtins.cpp
@@ -0,0 +1,503 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file cpu/builtins.cpp + * \brief Native OpenCL C built-in functions + * + * All these built-ins are directly called by kernels. When the LLVM JIT + * sees a function name it doesn't know, it calls \c getBuiltin() with this + * name as parameter. This function then returns the address of an actual + * function implementation, that finally gets called by the kernel when + * it is run. 
+ */ + +#include "builtins.h" +#include "kernel.h" +#include "buffer.h" + +#include "../events.h" +#include "../memobject.h" + +#include <sys/mman.h> +#include <signal.h> + +#include <llvm/IR/Function.h> + +#include <iostream> +#include <cstring> +#include <cmath> +#include <boost/math/special_functions.hpp> + +#include <stdio.h> + +using namespace Coal; + +unsigned char *imageData(unsigned char *base, size_t x, size_t y, size_t z, + size_t row_pitch, size_t slice_pitch, + unsigned int bytes_per_pixel) +{ + unsigned char *result = base; + + result += (z * slice_pitch) + + (y * row_pitch) + + (x * bytes_per_pixel); + + return result; +} + +/* + * TLS-related functions + */ +__thread Coal::CPUKernelWorkGroup *g_work_group; /*!< \brief \c Coal::CPUKernelWorkGroup currently running on this thread */ +__thread void *work_items_data; /*!< \brief Space allocated for work-items stacks, see \ref barrier */ +__thread size_t work_items_size; /*!< \brief Size of \c work_items_data, see \ref barrier */ + +void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current) +{ + g_work_group = current; +} + +void *getWorkItemsData(size_t &size) +{ + size = work_items_size; + return work_items_data; +} + +void setWorkItemsData(void *ptr, size_t size) +{ + work_items_data = ptr; + work_items_size = size; +} + +/* + * Actual built-ins implementations + */ +cl_uint CPUKernelWorkGroup::getWorkDim() const +{ + return p_work_dim; +} + +size_t CPUKernelWorkGroup::getGlobalId(cl_uint dimindx) const +{ + if (dimindx > p_work_dim) + return 0; + + return p_global_id_start_offset[dimindx] + p_current_context->local_id[dimindx]; +} + +size_t CPUKernelWorkGroup::getGlobalSize(cl_uint dimindx) const +{ + if (dimindx >p_work_dim) + return 1; + + return p_event->global_work_size(dimindx); +} + +size_t CPUKernelWorkGroup::getLocalSize(cl_uint dimindx) const +{ + if (dimindx > p_work_dim) + return 1; + + return p_event->local_work_size(dimindx); +} + +size_t CPUKernelWorkGroup::getLocalID(cl_uint 
dimindx) const +{ + if (dimindx > p_work_dim) + return 0; + + return p_current_context->local_id[dimindx]; +} + +size_t CPUKernelWorkGroup::getNumGroups(cl_uint dimindx) const +{ + if (dimindx > p_work_dim) + return 1; + + return (p_event->global_work_size(dimindx) / + p_event->local_work_size(dimindx)); +} + +size_t CPUKernelWorkGroup::getGroupID(cl_uint dimindx) const +{ + if (dimindx > p_work_dim) + return 0; + + return p_index[dimindx]; +} + +size_t CPUKernelWorkGroup::getGlobalOffset(cl_uint dimindx) const +{ + if (dimindx > p_work_dim) + return 0; + + return p_event->global_work_offset(dimindx); +} + +void CPUKernelWorkGroup::barrier(unsigned int flags) +{ + p_had_barrier = true; + + // Allocate or reuse TLS memory for the stacks (it isn't freed between + // the work groups, and even the kernels, so if we need less space than + // allocated, it's good) + if (!p_contexts) + { + if (p_current_work_item != 0) + { + // Completely abnormal, it means that not every work-items + // encounter the barrier + std::cerr << "*** Not every work-items of " + << p_kernel->function()->getName().str() + << " calls barrier(); !" 
<< std::endl;
+            return;
+        }
+
+        // Allocate or reuse the stacks
+        size_t contexts_size;
+        p_contexts = getWorkItemsData(contexts_size);
+        size_t needed_size = p_num_work_items * (p_stack_size + sizeof(Context));
+
+        if (!p_contexts || contexts_size < needed_size)
+        {
+            // We must allocate a new space
+            if (p_contexts)
+                munmap(p_contexts, contexts_size);
+
+            p_contexts = mmap(0, needed_size, PROT_EXEC | PROT_READ | PROT_WRITE, /* People say a stack must be executable */
+                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+            if (p_contexts == MAP_FAILED)
+            {
+                // mmap signals failure with MAP_FAILED, not with a null
+                // pointer. Forget the (already unmapped) old region so no
+                // later barrier reuses a dangling pointer, and bail out
+                // instead of dereferencing an invalid address below.
+                p_contexts = 0;
+                setWorkItemsData(0, 0);
+
+                std::cerr << "*** Unable to allocate work-item stacks for "
+                          << p_kernel->function()->getName().str()
+                          << std::endl;
+                return;
+            }
+
+            // Remember the size of the mapping we just created: this is the
+            // value the next barrier compares against and passes to munmap.
+            setWorkItemsData(p_contexts, needed_size);
+        }
+
+        // Now that we have a real main context, initialize it
+        p_current_context = getContextAddr(0);
+        p_current_context->initialized = 1;
+        std::memset(p_current_context->local_id, 0, p_work_dim * sizeof(size_t));
+
+        getcontext(&p_current_context->context);
+    }
+
+    // Take the next context
+    p_current_work_item++;
+    if (p_current_work_item == p_num_work_items) p_current_work_item = 0;
+
+    Context *next = getContextAddr(p_current_work_item);
+    Context *main = getContextAddr(0); // The context not created with makecontext
+
+    // If the next context isn't initialized, initialize it.
+    // Note: mmap zeroes the memory, so next->initialized == 0 if it isn't initialized
+    if (next->initialized == 0)
+    {
+        next->initialized = 1;
+
+        // local-id of next is the one of the current context, but incVec'ed
+        std::memcpy(next->local_id, p_current_context->local_id,
+                    MAX_WORK_DIMS * sizeof(size_t));
+
+        incVec(p_work_dim, next->local_id, p_max_local_id);
+
+        // Initialize the next context
+        if (getcontext(&next->context) != 0)
+            return;
+
+        // Get its stack.
It is located a next + sizeof(Context) + char *stack = (char *)next; + stack += sizeof(Context); + + next->context.uc_link = &main->context; + next->context.uc_stack.ss_sp = stack; + next->context.uc_stack.ss_size = p_stack_size; + + // Tell it to run the kernel function + makecontext(&next->context, (void (*)())p_kernel_func_addr, 1, p_args); + } + + // Switch to the next context + ucontext_t *cur = &p_current_context->context; + p_current_context = next; + + swapcontext(cur, &next->context); + + // When we return here, it means that all the other work items encountered + // a barrier and that we returned to this one. We can continue. +} + +void CPUKernelWorkGroup::builtinNotFound(const std::string &name) const +{ + std::cout << "OpenCL: Non-existant builtin function " << name << std::endl; + std::cout << " found in " << p_kernel->function()->getName().str() + << '.' << std::endl; +} + +/* + * Built-in functions + */ + +static size_t get_global_id(cl_uint dimindx) +{ + return g_work_group->getGlobalId(dimindx); +} + +static cl_uint get_work_dim() +{ + return g_work_group->getWorkDim(); +} + +static size_t get_global_size(uint dimindx) +{ + return g_work_group->getGlobalSize(dimindx); +} + +static size_t get_local_size(uint dimindx) +{ + return g_work_group->getLocalSize(dimindx); +} + +static size_t get_local_id(uint dimindx) +{ + return g_work_group->getLocalID(dimindx); +} + +static size_t get_num_groups(uint dimindx) +{ + return g_work_group->getNumGroups(dimindx); +} + +static size_t get_group_id(uint dimindx) +{ + return g_work_group->getGroupID(dimindx); +} + +static size_t get_global_offset(uint dimindx) +{ + return g_work_group->getGlobalOffset(dimindx); +} + +static void barrier(unsigned int flags) +{ + g_work_group->barrier(flags); +} + +// Images + +static int get_image_width(Image2D *image) +{ + return image->width(); +} + +static int get_image_height(Image2D *image) +{ + return image->height(); +} + +static int get_image_depth(Image3D *image) +{ + if 
(image->type() != MemObject::Image3D) + return 1; + + return image->depth(); +} + +static int get_image_channel_data_type(Image2D *image) +{ + return image->format().image_channel_data_type; +} + +static int get_image_channel_order(Image2D *image) +{ + return image->format().image_channel_order; +} + +static void *image_data(Image2D *image, int x, int y, int z, int *order, int *type) +{ + *order = image->format().image_channel_order; + *type = image->format().image_channel_data_type; + + return g_work_group->getImageData(image, x, y, z); +} + +static bool is_image_3d(Image3D *image) +{ + return (image->type() == MemObject::Image3D ? 1 : 0); +} + +static void write_imagef(Image2D *image, int x, int y, int z, float *color) +{ + g_work_group->writeImage(image, x, y, z, color); +} + +static void write_imagei(Image2D *image, int x, int y, int z, int32_t *color) +{ + g_work_group->writeImage(image, x, y, z, color); +} + +static void write_imageui(Image2D *image, int x, int y, int z, uint32_t *color) +{ + g_work_group->writeImage(image, x, y, z, color); +} + +static void read_imagefi(float *result, Image2D *image, int x, int y, int z, + int32_t sampler) +{ + g_work_group->readImage(result, image, x, y, z, sampler); +} + +static void read_imageii(int32_t *result, Image2D *image, int x, int y, int z, + int32_t sampler) +{ + g_work_group->readImage(result, image, x, y, z, sampler); +} + +static void read_imageuii(uint32_t *result, Image2D *image, int x, int y, int z, + int32_t sampler) +{ + g_work_group->readImage(result, image, x, y, z, sampler); +} + +static void read_imageff(float *result, Image2D *image, float x, float y, + float z, int32_t sampler) +{ + g_work_group->readImage(result, image, x, y, z, sampler); +} + +static void read_imageif(int32_t *result, Image2D *image, float x, float y, + float z, int32_t sampler) +{ + g_work_group->readImage(result, image, x, y, z, sampler); +} + +static void read_imageuif(uint32_t *result, Image2D *image, float x, float y, + float 
z, int32_t sampler) +{ + g_work_group->readImage(result, image, x, y, z, sampler); +} + +/* Dummy function to plug missing ARM ABI EH fxns: */ +static void dummy_fxn(void) +{ +} + + +/* + * Bridge between LLVM and us + */ +static void unimplemented_stub() +{ +} + +void *getBuiltin(const std::string &name) +{ + if (name == "get_global_id") + return (void *)&get_global_id; + else if (name == "get_work_dim") + return (void *)&get_work_dim; + else if (name == "get_global_size") + return (void *)&get_global_size; + else if (name == "get_local_size") + return (void *)&get_local_size; + else if (name == "get_local_id") + return (void *)&get_local_id; + else if (name == "get_num_groups") + return (void *)&get_num_groups; + else if (name == "get_group_id") + return (void *)&get_group_id; + else if (name == "get_global_offset") + return (void *)&get_global_offset; + else if (name == "barrier") + return (void *)&barrier; + + else if (name == "__cpu_get_image_width") + return (void *)&get_image_width; + else if (name == "__cpu_get_image_height") + return (void *)&get_image_height; + else if (name == "__cpu_get_image_depth") + return (void *)&get_image_depth; + else if (name == "__cpu_get_image_channel_data_type") + return (void *)&get_image_channel_data_type; + else if (name == "__cpu_get_image_channel_order") + return (void *)&get_image_channel_order; + else if (name == "__cpu_image_data") + return (void *)&image_data; + else if (name == "__cpu_is_image_3d") + return (void *)&is_image_3d; + else if (name == "__cpu_write_imagef") + return (void *)&write_imagef; + else if (name == "__cpu_write_imagei") + return (void *)&write_imagei; + else if (name == "__cpu_write_imageui") + return (void *)&write_imageui; + else if (name == "__cpu_read_imagefi") + return (void *)&read_imagefi; + else if (name == "__cpu_read_imageii") + return (void *)&read_imageii; + else if (name == "__cpu_read_imageuii") + return (void *)&read_imageuii; + else if (name == "__cpu_read_imageff") + return 
(void *)&read_imageff; + else if (name == "__cpu_read_imageif") + return (void *)&read_imageif; + else if (name == "__cpu_read_imageuif") + return (void *)&read_imageuif; + + else if (name == "debug") + return (void *)&printf; + else if (name == "__aeabi_unwind_cpp_pr0") + return (void *)&dummy_fxn; + else if (name == "__aeabi_unwind_cpp_pr1") + return (void *)&dummy_fxn; + else if (name == "__aeabi_unwind_cpp_pr2") + return (void *)&dummy_fxn; + + // Math library disambiguation for OpenCL double functions of the same name. + else if (name == "builtin_sincos") + return (void *)&sincos; + else if (name == "builtin_lgamma_r") + return (void *)&lgamma_r; + else if (name == "builtin_modf") + return (void *)&modf; + else if (name == "builtin_remquo") + return (void *)&remquo; + else if (name == "builtin_pow") + return (void *)&pow; + else if (name == "builtin_exp10f") + return (void *)&exp10f; + else if (name == "builtin_exp10") + return (void *)&exp10; + +#if 0 + // Other misc functions Khronos tests say are builtins, though not in the spec! + else if (name == "memcpy") + return (void *)&memcpy; +#endif + + // Function not found + g_work_group->builtinNotFound(name); + + return (void *)&unimplemented_stub; +} diff --git a/src/core/cpu/builtins.h b/src/core/cpu/builtins.h new file mode 100644 index 0000000..69143ea --- /dev/null +++ b/src/core/cpu/builtins.h @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file builtins.h + * \brief CPU built-in functions + */ +#ifndef __BUILTINS_H__ +#define __BUILTINS_H__ + +#include <string> + +namespace Coal { + class CPUKernelWorkGroup; +} + +/** + * \brief Set the current kernel work-group of this thread + * \param current \c Coal::CPUKernelWorkGroup to be set in \c g_work_group. 
+ */ +void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current); + +/** + * \brief Return the address of a built-in function given its name + * \param name name of the built-in whose address is requested + */ +void *getBuiltin(const std::string &name); + +/** + * \brief Work-item stacks + * \see \ref barrier + * \param size size of the allocated space for stacks + * \return address of the allocated space for stacks + */ +void *getWorkItemsData(size_t &size); + +/** + * \brief Set work-item stacks + * \see \ref barrier + * \param ptr address of allocated space for stacks + * \param size size of the allocated space for stacks + */ +void setWorkItemsData(void *ptr, size_t size); + +/** + * \brief Increment a n-component vector given a maximum value + * + * This function is used to increment a vector for which a set of maximum values + * each of its element can reach before the next is incremented. + * + * For example, if \p dims is \c 3, \p vec starts at <tt>{0, 0, 0}</tt> and + * \p maxs if <tt>{2, 3, 1}</tt>, repeatedly calling this function with the + * same vector will produce the following results : + * + * \code + * {0, 0, 1} + * {0, 1, 0} + * {0, 1, 1} + * {0, 2, 0} + * {0, 2, 1} + * {0, 3, 0} + * {0, 3, 1} + * {1, 0, 0} + * ... + * \endcode + * + * Until \p vec reaches <tt>{2, 3, 1}</tt>. + * + * \param dims number of elements in the vectors + * \param vec vector whose elements will be incremented + * \param maxs vector containing a maximum value above which each corresponding + * element of \p vec cannot go. + * \return false if the increment was ok, true if \p vec was already at it's + * maximum value and couldn't be further incremented. 
+ */ +template<typename T> +bool incVec(unsigned long dims, T *vec, T *maxs) +{ + bool overflow = false; + + for (unsigned int i=0; i<dims; ++i) + { + vec[i] += 1; + + if (vec[i] > maxs[i]) + { + vec[i] = 0; + overflow = true; + } + else + { + overflow = false; + break; + } + } + + return overflow; +} + +/** + * \brief Address of a pixel in an image + * + * This function is heavily used when Clover needs to address a pixel or a byte + * in a rectangular or three-dimensional image or buffer. + * + * \param base address of the first pixel in the image (address of the image itself) + * \param x X coordinate, cannot be bigger or equal to \c width + * \param y Y coordinate, cannot be bigger or equal to \c height + * \param z Z coordinate, cannot be bigger or equal to \c depth (1 for 2D arrays) + * \param row_pitch size in bytes of a row of pixels in the image + * \param slice_pitch size in bytes of a slice in a 3D array + * \param bytes_per_pixel bytes per pixel (1 for simple buffers), used when + * coordinates are in pixels and not in bytes. + */ +unsigned char *imageData(unsigned char *base, size_t x, size_t y, size_t z, + size_t row_pitch, size_t slice_pitch, + unsigned int bytes_per_pixel); + +#endif + diff --git a/src/core/cpu/device.cpp b/src/core/cpu/device.cpp new file mode 100644 index 0000000..eb3fcb1 --- /dev/null +++ b/src/core/cpu/device.cpp @@ -0,0 +1,675 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file cpu/device.cpp + * \brief CPU Device + */ + +#include "device.h" +#include "buffer.h" +#include "kernel.h" +#include "program.h" +#include "worker.h" +#include "builtins.h" + +#include <core/config.h> +#include "../propertylist.h" +#include "../commandqueue.h" +#include "../events.h" +#include "../memobject.h" +#include "../kernel.h" +#include "../program.h" +#include "../util.h" + +#include <cstring> +#include <cstdlib> +#include <unistd.h> + +#include <iostream> +#include <fstream> +#include <sstream> + +using namespace Coal; + +#if !(defined(DSPC868X) || defined(SHAMROCK_BUILD)) +#include "../dsp/shmem.h" +// unsigned arm_speed(); +#endif + +#define ONE_GIGABYTE (1 << 30) + +CPUDevice::CPUDevice() +: DeviceInterface(), p_cores(0), p_num_events(0), p_workers(0), p_stop(false), + p_initialized(false) +{ + // Get info about the system + p_cores = sysconf(_SC_NPROCESSORS_ONLN); + p_cpu_mhz = 0.0f; + + std::filebuf fb; + fb.open("/proc/cpuinfo", std::ios::in); + std::istream 
is(&fb); + + while (!is.eof()) + { + std::string key, value; + + std::getline(is, key, ':'); + is.ignore(1); + std::getline(is, value); + + if (key.compare(0, 7, "cpu MHz") == 0) + { + std::istringstream ss(value); + ss >> p_cpu_mhz; + } + + if (key.compare(0, 10, "model name") == 0) + p_device_name = value; + + if (key.compare(0, 9, "Processor") == 0) + p_device_name = value; + } + + if (p_cpu_mhz == 0.0f) + { + std::string file("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq"); + std::ifstream fs(file.c_str()); + if (fs) { fs >> p_cpu_mhz; p_cpu_mhz /= 1000; } + } + + if (p_cpu_mhz == 0.0f) p_cpu_mhz = 1000.0; + +#if !defined(DSPC868X) + // p_cpu_mhz = arm_speed(); +#endif +} + + +void CPUDevice::init() +{ + if (p_initialized) return; + + // Initialize the locking machinery + pthread_cond_init(&p_events_cond, 0); + pthread_mutex_init(&p_events_mutex, 0); + + // Create worker threads + p_workers = (pthread_t *)std::malloc(numCPUs() * sizeof(pthread_t)); + + for (unsigned int i=0; i<numCPUs(); ++i) + { + pthread_create(&p_workers[i], 0, &worker, this); + } + + p_initialized = true; +} + +CPUDevice::~CPUDevice() +{ + if (!p_initialized) + return; + + // Terminate the workers and wait for them + pthread_mutex_lock(&p_events_mutex); + + p_stop = true; + + pthread_cond_broadcast(&p_events_cond); + pthread_mutex_unlock(&p_events_mutex); + + for (unsigned int i=0; i<numCPUs(); ++i) + { + pthread_join(p_workers[i], 0); + } + + // Free allocated memory + std::free((void *)p_workers); + pthread_mutex_destroy(&p_events_mutex); + pthread_cond_destroy(&p_events_cond); +} + +DeviceBuffer *CPUDevice::createDeviceBuffer(MemObject *buffer, cl_int *rs) +{ + return (DeviceBuffer *)new CPUBuffer(this, buffer, rs); +} + +DeviceProgram *CPUDevice::createDeviceProgram(Program *program) +{ + return (DeviceProgram *)new CPUProgram(this, program); +} + +DeviceKernel *CPUDevice::createDeviceKernel(Kernel *kernel, + llvm::Function *function) +{ + return (DeviceKernel *)new 
CPUKernel(this, kernel, function); +} + +cl_int CPUDevice::initEventDeviceData(Event *event) +{ + switch (event->type()) + { + case Event::MapBuffer: + { + MapBufferEvent *e = (MapBufferEvent *)event; + CPUBuffer *buf = (CPUBuffer *)e->buffer()->deviceBuffer(this); + unsigned char *data = (unsigned char *)buf->data(); + + data += e->offset(); + + e->setPtr((void *)data); + break; + } + case Event::MapImage: + { + MapImageEvent *e = (MapImageEvent *)event; + Image2D *image = (Image2D *)e->buffer(); + CPUBuffer *buf = (CPUBuffer *)image->deviceBuffer(this); + unsigned char *data = (unsigned char *)buf->data(); + + data = imageData(data, + e->origin(0), + e->origin(1), + e->origin(2), + image->row_pitch(), + image->slice_pitch(), + image->pixel_size()); + + e->setPtr((void *)data); + e->setRowPitch(image->row_pitch()); + e->setSlicePitch(image->slice_pitch()); + break; + } + case Event::UnmapMemObject: + // Nothing do to + break; + + case Event::NDRangeKernel: + case Event::TaskKernel: + { + // Instantiate the JIT for the CPU program + KernelEvent *e = (KernelEvent *)event; + Program *p = (Program *)e->kernel()->parent(); + CPUProgram *prog = (CPUProgram *)p->deviceDependentProgram(this); + + if (!prog->initJIT()) + return CL_INVALID_PROGRAM_EXECUTABLE; + + // Set device-specific data + CPUKernelEvent *cpu_e = new CPUKernelEvent(this, e); + e->setDeviceData((void *)cpu_e); + + break; + } + default: + break; + } + + return CL_SUCCESS; +} + +void CPUDevice::freeEventDeviceData(Event *event) +{ + switch (event->type()) + { + case Event::NDRangeKernel: + case Event::TaskKernel: + { + CPUKernelEvent *cpu_e = (CPUKernelEvent *)event->deviceData(); + + if (cpu_e) + delete cpu_e; + } + default: + break; + } +} + +void CPUDevice::pushEvent(Event *event) +{ + // Add an event in the list + pthread_mutex_lock(&p_events_mutex); + + p_events.push_back(event); + p_num_events++; // Way faster than STL list::size() ! 
+ + pthread_cond_broadcast(&p_events_cond); + pthread_mutex_unlock(&p_events_mutex); +} + +Event *CPUDevice::getEvent(bool &stop) +{ + // Return the first event in the list, if any. Remove it if it is a + // single-shot event. + pthread_mutex_lock(&p_events_mutex); + + while (p_num_events == 0 && !p_stop) + pthread_cond_wait(&p_events_cond, &p_events_mutex); + + if (p_stop) + { + pthread_mutex_unlock(&p_events_mutex); + stop = true; + return 0; + } + + Event *event = p_events.front(); + + // If the run of this event will finish it, remove it from the list + bool last_slot = true; + + if (event->type() == Event::NDRangeKernel || + event->type() == Event::TaskKernel) + { + CPUKernelEvent *ke = (CPUKernelEvent *)event->deviceData(); + last_slot = ke->reserve(); + } + + if (last_slot) + { + p_num_events--; + p_events.pop_front(); + } + + pthread_mutex_unlock(&p_events_mutex); + + return event; +} + +/****************************************************************************** +* Device's decision about whether CommandQueue should push more events over +* This number could be tuned (e.g. using ooo example). Note that p_num_events +* are in device's queue, but not yet executed. 
+******************************************************************************/ +bool CPUDevice::gotEnoughToWorkOn() +{ + return p_num_events > 0; +} + +unsigned int CPUDevice::numCPUs() const +{ + return p_cores; +} + +float CPUDevice::cpuMhz() const +{ + return p_cpu_mhz; +} + +// From inner parentheses to outher ones : +// +// sizeof * 8 => 8 +// -1 => 7 +// 1 << $ => 10000000 +// -1 => 01111111 +// *2 => 11111110 +// +1 => 11111111 +// +// A simple way to do this is (1 << (sizeof(type) * 8)) - 1, but it overflows +// the type (for int8, 1 << $ = 100000000 = 256 > 255) +#define TYPE_MAX(type) ((((type)1 << ((sizeof(type) * 8) - 1)) - 1) * 2 + 1) + +cl_int CPUDevice::info(cl_device_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) const +{ + void *value = 0; + size_t value_length = 0; + + union { + cl_device_type cl_device_type_var; + cl_uint cl_uint_var; + size_t size_t_var; + cl_ulong cl_ulong_var; + cl_bool cl_bool_var; + cl_device_fp_config cl_device_fp_config_var; + cl_device_mem_cache_type cl_device_mem_cache_type_var; + cl_device_local_mem_type cl_device_local_mem_type_var; + cl_device_exec_capabilities cl_device_exec_capabilities_var; + cl_command_queue_properties cl_command_queue_properties_var; + cl_platform_id cl_platform_id_var; + size_t work_dims[MAX_WORK_DIMS]; + }; + + switch (param_name) + { + case CL_DEVICE_TYPE: + SIMPLE_ASSIGN(cl_device_type, CL_DEVICE_TYPE_CPU); + break; + + case CL_DEVICE_VENDOR_ID: + SIMPLE_ASSIGN(cl_uint, 0); + break; + + case CL_DEVICE_MAX_COMPUTE_UNITS: + SIMPLE_ASSIGN(cl_uint, numCPUs()); + break; + + case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: + SIMPLE_ASSIGN(cl_uint, MAX_WORK_DIMS); + break; + + case CL_DEVICE_MAX_WORK_GROUP_SIZE: + SIMPLE_ASSIGN(size_t, ONE_GIGABYTE); + break; + + case CL_DEVICE_MAX_WORK_ITEM_SIZES: + for (int i=0; i<MAX_WORK_DIMS; ++i) + { + work_dims[i] = ONE_GIGABYTE; + } + value_length = MAX_WORK_DIMS * sizeof(size_t); + value = &work_dims; + break; + + 
case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR: + SIMPLE_ASSIGN(cl_uint, 16); + break; + + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT: + SIMPLE_ASSIGN(cl_uint, 8); + break; + + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT: + SIMPLE_ASSIGN(cl_uint, 4); + break; + + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG: + SIMPLE_ASSIGN(cl_uint, 2); + break; + + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT: + SIMPLE_ASSIGN(cl_uint, 4); + break; + + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE: + SIMPLE_ASSIGN(cl_uint, 2); + break; + + case CL_DEVICE_MAX_CLOCK_FREQUENCY: + SIMPLE_ASSIGN(cl_uint, cpuMhz()); + break; + + case CL_DEVICE_ADDRESS_BITS: + SIMPLE_ASSIGN(cl_uint, 8*sizeof(void *)); + break; + + case CL_DEVICE_MAX_READ_IMAGE_ARGS: + SIMPLE_ASSIGN(cl_uint, 0); //images not supported + break; + + case CL_DEVICE_MAX_WRITE_IMAGE_ARGS: + SIMPLE_ASSIGN(cl_uint, 0); // images not supported + break; + + case CL_DEVICE_IMAGE2D_MAX_WIDTH: + SIMPLE_ASSIGN(size_t, 0); // images not supported + break; + + case CL_DEVICE_IMAGE2D_MAX_HEIGHT: + SIMPLE_ASSIGN(size_t, 0); //images not supported + break; + + case CL_DEVICE_IMAGE3D_MAX_WIDTH: + SIMPLE_ASSIGN(size_t, 0); //images not supported + break; + + case CL_DEVICE_IMAGE3D_MAX_HEIGHT: + SIMPLE_ASSIGN(size_t, 0); //images not supported + break; + + case CL_DEVICE_IMAGE3D_MAX_DEPTH: + SIMPLE_ASSIGN(size_t, 0); //images not supported + break; + + case CL_DEVICE_IMAGE_SUPPORT: + SIMPLE_ASSIGN(cl_bool, CL_FALSE); //images not supported + break; + + case CL_DEVICE_MAX_PARAMETER_SIZE: + SIMPLE_ASSIGN(size_t, 65536); + break; + + case CL_DEVICE_MAX_SAMPLERS: + SIMPLE_ASSIGN(cl_uint, 0); //images not supported + break; + + case CL_DEVICE_MEM_BASE_ADDR_ALIGN: + SIMPLE_ASSIGN(cl_uint, 1024 /* sizeof(long16)*8) */); // 128 byte + break; + + case CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE: + SIMPLE_ASSIGN(cl_uint, 16); + break; + + case CL_DEVICE_SINGLE_FP_CONFIG: + // TODO: Check what an x86 SSE engine can support. 
+ // Currently not supporting CL_FP_DENORM + SIMPLE_ASSIGN(cl_device_fp_config, + CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST); + break; + + case CL_DEVICE_DOUBLE_FP_CONFIG: + // These are minimally required to be supported by the OCL spec: + SIMPLE_ASSIGN(cl_device_fp_config, + CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO | + CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM); + break; + + case CL_DEVICE_GLOBAL_MEM_CACHE_TYPE: + SIMPLE_ASSIGN(cl_device_mem_cache_type, + CL_READ_WRITE_CACHE); + break; + + case CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE: + // TODO: Get this information from the processor + SIMPLE_ASSIGN(cl_uint, 16); + break; + + case CL_DEVICE_GLOBAL_MEM_CACHE_SIZE: + // TODO: Get this information from the processor + SIMPLE_ASSIGN(cl_ulong, 512*1024*1024); + break; + + case CL_DEVICE_GLOBAL_MEM_SIZE: + // parse /proc/meminfo to get the value + SIMPLE_ASSIGN(cl_ulong, parse_file_line_value("/proc/meminfo", + "MemTotal:", 512*1024) * 1024); + break; + + case CL_DEVICE_MAX_MEM_ALLOC_SIZE: + case CL_DEVICE_LOCAL_MEM_SIZE: + case CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: + // TODO: 1 Gio seems to be enough for software acceleration + +#if defined(__arm__) + SIMPLE_ASSIGN(cl_ulong, 512*1024*1024); +#else + SIMPLE_ASSIGN(cl_ulong, 1*1024*1024*1024); +#endif + break; + + case CL_DEVICE_MAX_CONSTANT_ARGS: + SIMPLE_ASSIGN(cl_uint, 65536); + break; + + case CL_DEVICE_LOCAL_MEM_TYPE: + SIMPLE_ASSIGN(cl_device_local_mem_type, CL_GLOBAL); + break; + + + case CL_DEVICE_ERROR_CORRECTION_SUPPORT: + SIMPLE_ASSIGN(cl_bool, CL_FALSE); + break; + + case CL_DEVICE_PROFILING_TIMER_RESOLUTION: + // TODO + SIMPLE_ASSIGN(size_t, 1000); // 1000 nanoseconds = 1 ms + break; + + case CL_DEVICE_ENDIAN_LITTLE: + SIMPLE_ASSIGN(cl_bool, CL_TRUE); + break; + + case CL_DEVICE_AVAILABLE: + SIMPLE_ASSIGN(cl_bool, CL_TRUE); + break; + + case CL_DEVICE_COMPILER_AVAILABLE: + SIMPLE_ASSIGN(cl_bool, CL_TRUE); + break; + + case CL_DEVICE_EXECUTION_CAPABILITIES: + 
SIMPLE_ASSIGN(cl_device_exec_capabilities, CL_EXEC_KERNEL | + CL_EXEC_NATIVE_KERNEL); + break; + + case CL_DEVICE_QUEUE_PROPERTIES: + SIMPLE_ASSIGN(cl_command_queue_properties, + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | + CL_QUEUE_PROFILING_ENABLE); + break; + + case CL_DEVICE_NAME: + value_length = p_device_name.size() + 1; + value = const_cast<char*>(p_device_name.c_str()); + break; + + case CL_DEVICE_VENDOR: + STRING_ASSIGN("Generic"); + break; + + case CL_DRIVER_VERSION: + STRING_ASSIGN("" COAL_VERSION); + break; + + case CL_DEVICE_PROFILE: + STRING_ASSIGN("FULL_PROFILE"); + break; + + case CL_DEVICE_VERSION: + STRING_ASSIGN("OpenCL 1.1 " COAL_VERSION); + break; + + case CL_DEVICE_EXTENSIONS: + STRING_ASSIGN("cl_khr_global_int32_base_atomics" + " cl_khr_global_int32_extended_atomics" + " cl_khr_local_int32_base_atomics" + " cl_khr_local_int32_extended_atomics" + " cl_khr_byte_addressable_store" + + " cl_khr_fp64" + " cl_khr_int64_base_atomics" + " cl_khr_int64_extended_atomics") + + break; + + case CL_DEVICE_PLATFORM: + SIMPLE_ASSIGN(cl_platform_id, 0); + break; + + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF: + SIMPLE_ASSIGN(cl_uint, 0); + break; + + case CL_DEVICE_HOST_UNIFIED_MEMORY: + SIMPLE_ASSIGN(cl_bool, CL_TRUE); + break; + + case CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR: + SIMPLE_ASSIGN(cl_uint, 16); + break; + + case CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT: + SIMPLE_ASSIGN(cl_uint, 8); + break; + + case CL_DEVICE_NATIVE_VECTOR_WIDTH_INT: + SIMPLE_ASSIGN(cl_uint, 4); + break; + + case CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG: + SIMPLE_ASSIGN(cl_uint, 2); + break; + + case CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT: + SIMPLE_ASSIGN(cl_uint, 4); + break; + + case CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE: + SIMPLE_ASSIGN(cl_uint, 2); + break; + + case CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF: + SIMPLE_ASSIGN(cl_uint, 0); + break; + + case CL_DEVICE_OPENCL_C_VERSION: + STRING_ASSIGN("OpenCL C 1.1 LLVM " LLVM_VERSION); + break; + + default: + return CL_INVALID_VALUE; + } + + if (param_value 
&& param_value_size < value_length) + return CL_INVALID_VALUE; + + if (param_value_size_ret) + *param_value_size_ret = value_length; + + if (param_value) + std::memcpy(param_value, value, value_length); + + return CL_SUCCESS; +} + +#if !defined(DSPC868X) +#if 0 // /dev/mem is no longer available +unsigned arm_speed() +{ + //return 1000.0; + const unsigned TETRIS_PLL = 125000000; + const unsigned pagesize = 0x1000; + + shmem_persistent page; + page.configure(0x02620000, pagesize); + char *host_msmc = (char*)page.map(0x02620000, pagesize); + unsigned SECPLLCTL0 = *(unsigned*)(host_msmc + 0x370); + unsigned prediv = 1 + (SECPLLCTL0 & 0x3F); + unsigned mult = 1 + ((SECPLLCTL0 >> 6) & 0x1FFF); + unsigned output_div = 1 + ((SECPLLCTL0 >> 19) & 0xF); + unsigned speed = TETRIS_PLL * mult / prediv / output_div; + page.unmap(host_msmc, pagesize); + + return speed / 1000000; +} +#endif +#endif + diff --git a/src/core/cpu/device.h b/src/core/cpu/device.h new file mode 100644 index 0000000..a0ad6ef --- /dev/null +++ b/src/core/cpu/device.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file cpu/device.h + * \brief CPU device + */ + +#ifndef __CPU_DEVICE_H__ +#define __CPU_DEVICE_H__ + +#include "../deviceinterface.h" + +#include <pthread.h> +#include <list> +#include <string> + +namespace Coal +{ + +class MemObject; +class Event; +class Program; +class Kernel; + +/** + * \brief CPU device + * + * This class is the base of all the CPU-accelerated OpenCL processing. It + * creates and manages subclasses such as \c Coal::DeviceBuffer, + * \c Coal::DeviceProgram and \c Coal::DeviceKernel. + * + * This class and the aforementioned ones work together to compile and run + * kernels using the LLVM JIT, manage buffers, provide built-in functions + * and do all of this in a multithreaded fashion using worker threads. + * + * \see \ref events + */ +class CPUDevice : public DeviceInterface +{ + public: + CPUDevice(); + ~CPUDevice(); + + /** + * \brief Initialize the CPU device + * + * This function creates the worker threads and get information about + * the host system for the \c numCPUs() and \c cpuMhz functions. 
+ */ + void init(); + + cl_int info(cl_device_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) const; + + DeviceBuffer *createDeviceBuffer(MemObject *buffer, cl_int *rs); + DeviceProgram *createDeviceProgram(Program *program); + DeviceKernel *createDeviceKernel(Kernel *kernel, + llvm::Function *function); + + cl_int initEventDeviceData(Event *event); + void freeEventDeviceData(Event *event); + + void pushEvent(Event *event); + Event *getEvent(bool &stop); + bool gotEnoughToWorkOn(); + + unsigned int numCPUs() const; /*!< \brief Number of logical CPU cores on the system */ + float cpuMhz() const; /*!< \brief Speed of the CPU in Mhz */ + + std::string builtinsHeader(void) const { return "cpu.h"; } + + private: + unsigned int p_cores, p_num_events; + float p_cpu_mhz; + std::string p_device_name; + pthread_t *p_workers; + + std::list<Event *> p_events; + pthread_cond_t p_events_cond; + pthread_mutex_t p_events_mutex; + bool p_stop, p_initialized; +}; + +} + +#endif diff --git a/src/core/cpu/kernel.cpp b/src/core/cpu/kernel.cpp new file mode 100644 index 0000000..ef09f6b --- /dev/null +++ b/src/core/cpu/kernel.cpp @@ -0,0 +1,734 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file cpu/kernel.cpp + * \brief CPU kernel + */ + +#include "kernel.h" +#include "device.h" +#include "buffer.h" +#include "program.h" +#include "builtins.h" + +#include "../kernel.h" +#include "../memobject.h" +#include "../events.h" +#include "../program.h" + +#include <llvm/IR/Function.h> +#include <llvm/IR/Constants.h> +#include <llvm/IR/Instructions.h> +#include <llvm/IR/LLVMContext.h> +#include <llvm/IR/Module.h> +#include <llvm/ExecutionEngine/ExecutionEngine.h> + +#include <cstdlib> +#include <cstring> +#include <iostream> +#include <sys/mman.h> + +using namespace Coal; + +CPUKernel::CPUKernel(CPUDevice *device, Kernel *kernel, llvm::Function *function) +: DeviceKernel(), p_device(device), p_kernel(kernel), p_function(function), + p_call_function(0) +{ + pthread_mutex_init(&p_call_function_mutex, 0); + + const char *fn_name; + + // If we can reuse the same function between work groups, do it +/* tag out for now if (p_call_function) + { + llvm::Function *rs = p_call_function; + pthread_mutex_unlock(&p_call_function_mutex); + + return rs; + } */ + + /* Create a stub function in the form of + * + * void stub(void *args) { + * kernel(*(int *)((char *)args + 0), + * *(float 
**)((char *)args + sizeof(int)), + * *(sampler_t *)((char *)args + sizeof(int) + sizeof(float *))); + * } + * + * In LLVM, it is exprimed in the form of : + * + * @stub(i8* args) { + * kernel( + * load(i32* bitcast(i8* getelementptr(i8* args, i64 0), i32*)), + * load(float** bitcast(i8* getelementptr(i8* args, i64 4), float**)), + * ... + * ); + * } + */ + fn_name = kernel->p_name.c_str(); + Program *p = (Program *)kernel->parent(); + CPUProgram *prog = (CPUProgram *)(p->deviceDependentProgram(device)); + //llvm::Function *t_function = prog->jit()->FindFunctionNamed(fn_name); + + char * s_name = (char *) malloc(strlen(fn_name)+6); + sprintf(s_name,"_stub%s",fn_name); + + llvm::FunctionType *kernel_function_type = function->getFunctionType(); + llvm::FunctionType *stub_function_type = llvm::FunctionType::get( + function->getReturnType(), + llvm::Type::getInt8PtrTy( + function->getContext()), + false); + llvm::Function *stub_function = llvm::Function::Create( + stub_function_type, + llvm::Function::InternalLinkage, + s_name, + function->getParent()); + + // Insert a basic block + llvm::BasicBlock *basic_block = llvm::BasicBlock::Create( + function->getContext(), + "", + stub_function); + + // Create the function arguments + llvm::Argument &stub_arg = stub_function->getArgumentList().front(); + llvm::SmallVector<llvm::Value *, 8> args; + size_t args_offset = 0; + + for (unsigned int i=0; i<kernel_function_type->getNumParams(); ++i) + { + llvm::Type *param_type = kernel_function_type->getParamType(i); + llvm::Type *param_type_ptr = param_type->getPointerTo(); // We'll use pointers to the value + const Kernel::Arg *arg = p_kernel->arg(i); + + // Calculate the size of the arg + size_t arg_size = arg->valueSize() * arg->vecDim(); + + // Get where to place this argument + size_t arg_offset = typeOffset(args_offset, arg_size); + + // %1 = getelementptr(args, $arg_offset); + llvm::Value *getelementptr = llvm::GetElementPtrInst::CreateInBounds( + &stub_arg, + 
llvm::ConstantInt::get(stub_function->getContext(), + llvm::APInt(64, arg_offset)), + "", + basic_block); + + // %2 = bitcast(%1, $param_type_ptr) + llvm::Value *bitcast = new llvm::BitCastInst( + getelementptr, + param_type_ptr, + "", + basic_block); + + // %3 = load(%2) + llvm::Value *load = new llvm::LoadInst( + bitcast, + "", + false, + arg_size, // We ensure that an argument is always aligned on its size, it enables things like fast movaps + basic_block); + + // We have the value, send it to the function + args.push_back(load); + } + + // Create the call instruction + llvm::CallInst *call_inst = llvm::CallInst::Create( + function, + args, + "", + basic_block); + call_inst->setCallingConv(function->getCallingConv()); + call_inst->setTailCall(); + + // Create a return instruction to end the stub + llvm::ReturnInst::Create( + function->getContext(), + basic_block); + + // Retain the function if it can be reused + p_call_function = stub_function; + +} + +CPUKernel::~CPUKernel() +{ + if (p_call_function) + p_call_function->eraseFromParent(); + + pthread_mutex_destroy(&p_call_function_mutex); +} + +size_t CPUKernel::workGroupSize() +{ + // Just use CL_DEVICE_MAX_WORK_GROUP_SIZE + size_t param_value; + size_t param_value_size_ret; + + p_device->info(CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), + ¶m_value, ¶m_value_size_ret); + + return param_value; +} + +cl_ulong CPUKernel::localMemSize() const +{ + return 0; // TODO +} + +cl_ulong CPUKernel::privateMemSize() const +{ + return 0; // TODO +} + +size_t CPUKernel::preferredWorkGroupSizeMultiple() const +{ + unsigned int cpus = p_device->numCPUs(); + return cpus; +} + +template<typename T> +T k_exp(T base, unsigned int e) +{ + T rs = base; + + for (unsigned int i=1; i<e; ++i) + rs *= base; + + return rs; +} + +// Try to find the size a work group has to have to be executed the fastest on +// the CPU. 
+size_t CPUKernel::guessWorkGroupSize(cl_uint num_dims, cl_uint dim, + size_t global_work_size) const +{ + unsigned int cpus = p_device->numCPUs(); + + // Don't break in too small parts + if (k_exp(global_work_size, num_dims) > 64) + return global_work_size; + + // Find the divisor of global_work_size the closest to cpus but >= than it + unsigned int divisor = cpus; + + while (true) + { + if ((global_work_size % divisor) == 0) + break; + + // Don't let the loop go up to global_work_size, the overhead would be + // too huge + if (divisor > global_work_size || divisor > cpus * 32) + { + divisor = 1; // Not parallel but has no CommandQueue overhead + break; + } + } + + // Return the size + return global_work_size / divisor; +} + +llvm::Function *CPUKernel::function() const +{ + return p_function; +} + +Kernel *CPUKernel::kernel() const +{ + return p_kernel; +} + +CPUDevice *CPUKernel::device() const +{ + return p_device; +} + +// From Wikipedia : http://www.wikipedia.org/wiki/Power_of_two#Algorithm_to_round_up_to_power_of_two +template <class T> +T next_power_of_two(T k) { + if (k == 0) + return 1; + k--; + for (int i=1; i<sizeof(T)*8; i<<=1) + k = k | k >> i; + return k+1; +} + +size_t CPUKernel::typeOffset(size_t &offset, size_t type_len) +{ + size_t rs = offset; + + // Align offset to stype_len + type_len = next_power_of_two(type_len); + size_t mask = ~(type_len - 1); + + while (rs & mask != rs) + rs++; + + // Where to try to place the next value + offset = rs + type_len; + + return rs; +} + +llvm::Function *CPUKernel::callFunction() +{ + const char *fn_name; + pthread_mutex_lock(&p_call_function_mutex); + + // If we can reuse the same function between work groups, do it + if (p_call_function) + { + llvm::Function *rs = p_call_function; + pthread_mutex_unlock(&p_call_function_mutex); + + return rs; + } + + /* Create a stub function in the form of + * + * void stub(void *args) { + * kernel(*(int *)((char *)args + 0), + * *(float **)((char *)args + sizeof(int)), + * 
*(sampler_t *)((char *)args + sizeof(int) + sizeof(float *))); + * } + * + * In LLVM, it is exprimed in the form of : + * + * @stub(i8* args) { + * kernel( + * load(i32* bitcast(i8* getelementptr(i8* args, i64 0), i32*)), + * load(float** bitcast(i8* getelementptr(i8* args, i64 4), float**)), + * ... + * ); + * } + */ + fn_name = kernel()->p_name.c_str(); + Program *p = (Program *)kernel()->parent(); + CPUProgram *prog = (CPUProgram *)(p->deviceDependentProgram(device())); + llvm::Function *t_function = prog->jit()->FindFunctionNamed(fn_name); + + + llvm::FunctionType *kernel_function_type = p_function->getFunctionType(); + llvm::FunctionType *stub_function_type = llvm::FunctionType::get( + p_function->getReturnType(), + llvm::Type::getInt8PtrTy( + p_function->getContext()), + false); + llvm::Function *stub_function = llvm::Function::Create( + stub_function_type, + llvm::Function::InternalLinkage, + "stub", + p_function->getParent()); + + // Insert a basic block + llvm::BasicBlock *basic_block = llvm::BasicBlock::Create( + p_function->getContext(), + "", + stub_function); + + // Create the function arguments + llvm::Argument &stub_arg = stub_function->getArgumentList().front(); + llvm::SmallVector<llvm::Value *, 8> args; + size_t args_offset = 0; + + for (unsigned int i=0; i<kernel_function_type->getNumParams(); ++i) + { + llvm::Type *param_type = kernel_function_type->getParamType(i); + llvm::Type *param_type_ptr = param_type->getPointerTo(); // We'll use pointers to the value + const Kernel::Arg *arg = p_kernel->arg(i); + + // Calculate the size of the arg + size_t arg_size = arg->valueSize() * arg->vecDim(); + + // Get where to place this argument + size_t arg_offset = typeOffset(args_offset, arg_size); + + // %1 = getelementptr(args, $arg_offset); + llvm::Value *getelementptr = llvm::GetElementPtrInst::CreateInBounds( + &stub_arg, + llvm::ConstantInt::get(stub_function->getContext(), + llvm::APInt(64, arg_offset)), + "", + basic_block); + + // %2 = bitcast(%1, 
$param_type_ptr) + llvm::Value *bitcast = new llvm::BitCastInst( + getelementptr, + param_type_ptr, + "", + basic_block); + + // %3 = load(%2) + llvm::Value *load = new llvm::LoadInst( + bitcast, + "", + false, + arg_size, // We ensure that an argument is always aligned on its size, it enables things like fast movaps + basic_block); + + // We have the value, send it to the function + args.push_back(load); + } + + // Create the call instruction + llvm::CallInst *call_inst = llvm::CallInst::Create( + t_function, + args, + "", + basic_block); + call_inst->setCallingConv(p_function->getCallingConv()); + call_inst->setTailCall(); + + // Create a return instruction to end the stub + llvm::ReturnInst::Create( + p_function->getContext(), + basic_block); + + // Retain the function if it can be reused + p_call_function = stub_function; + + pthread_mutex_unlock(&p_call_function_mutex); + + return stub_function; +} + +/* + * CPUKernelEvent + */ +CPUKernelEvent::CPUKernelEvent(CPUDevice *device, KernelEvent *event) +: p_device(device), p_event(event), p_current_wg(0), p_finished_wg(0), + p_kernel_args(0) +{ + // Mutex + pthread_mutex_init(&p_mutex, 0); + + // Set current work group to (0, 0, ..., 0) + std::memset(p_current_work_group, 0, event->work_dim() * sizeof(size_t)); + + // Populate p_max_work_groups + p_num_wg = 1; + + for (cl_uint i=0; i<event->work_dim(); ++i) + { + p_max_work_groups[i] = + (event->global_work_size(i) / event->local_work_size(i)) - 1; // 0..n-1, not 1..n + + p_num_wg *= p_max_work_groups[i] + 1; + } +} + +CPUKernelEvent::~CPUKernelEvent() +{ + pthread_mutex_destroy(&p_mutex); + + if (p_kernel_args) + std::free(p_kernel_args); +} + +bool CPUKernelEvent::reserve() +{ + // Lock, this will be unlocked in takeInstance() + pthread_mutex_lock(&p_mutex); + + // Last work group if current == max - 1 + return (p_current_wg == p_num_wg - 1); +} + +bool CPUKernelEvent::finished() +{ + bool rs; + + pthread_mutex_lock(&p_mutex); + + rs = (p_finished_wg == 
p_num_wg); + + pthread_mutex_unlock(&p_mutex); + + return rs; +} + +void CPUKernelEvent::workGroupFinished() +{ + pthread_mutex_lock(&p_mutex); + + p_finished_wg++; + + pthread_mutex_unlock(&p_mutex); +} + +CPUKernelWorkGroup *CPUKernelEvent::takeInstance() +{ + CPUKernelWorkGroup *wg = new CPUKernelWorkGroup((CPUKernel *)p_event->deviceKernel(), + p_event, + this, + p_current_work_group); + + // Increment current work group + incVec(p_event->work_dim(), p_current_work_group, p_max_work_groups); + p_current_wg += 1; + + // Release event + pthread_mutex_unlock(&p_mutex); + + return wg; +} + +void *CPUKernelEvent::kernelArgs() const +{ + return p_kernel_args; +} + +void CPUKernelEvent::cacheKernelArgs(void *args) +{ + p_kernel_args = args; +} + +/* + * CPUKernelWorkGroup + */ +CPUKernelWorkGroup::CPUKernelWorkGroup(CPUKernel *kernel, KernelEvent *event, + CPUKernelEvent *cpu_event, + const size_t *work_group_index) +: p_kernel(kernel), p_cpu_event(cpu_event), p_event(event), + p_work_dim(event->work_dim()), p_contexts(0), p_stack_size(8192 /* TODO */), + p_had_barrier(false) +{ + + // Set index + std::memcpy(p_index, work_group_index, p_work_dim * sizeof(size_t)); + + // Set maxs and global id + p_num_work_items = 1; + + for (unsigned int i=0; i<p_work_dim; ++i) + { + p_max_local_id[i] = event->local_work_size(i) - 1; // 0..n-1, not 1..n + p_num_work_items *= event->local_work_size(i); + + // Set global id + p_global_id_start_offset[i] = (p_index[i] * event->local_work_size(i)) + + event->global_work_offset(i); + } +} + +CPUKernelWorkGroup::~CPUKernelWorkGroup() +{ + p_cpu_event->workGroupFinished(); +} + +void *CPUKernelWorkGroup::callArgs(std::vector<void *> &locals_to_free) +{ + if (p_cpu_event->kernelArgs() && !p_kernel->kernel()->hasLocals()) + { + // We have cached the args and can reuse them + return p_cpu_event->kernelArgs(); + } + + // We need to create them from scratch + void *rs; + + size_t args_size = 0; + + for (unsigned int i=0; 
i<p_kernel->kernel()->numArgs(); ++i) + { + const Kernel::Arg *arg = p_kernel->kernel()->arg(i); + CPUKernel::typeOffset(args_size, arg->valueSize() * arg->vecDim()); + } + + rs = std::malloc(args_size); + + if (!rs) + return NULL; + + size_t arg_offset = 0; + + for (unsigned int i=0; i<p_kernel->kernel()->numArgs(); ++i) + { + const Kernel::Arg *arg = p_kernel->kernel()->arg(i); + size_t size = arg->valueSize() * arg->vecDim(); + size_t offset = CPUKernel::typeOffset(arg_offset, size); + + // Where to place the argument + unsigned char *target = (unsigned char *)rs; + target += offset; + + // We may have to perform some changes in the values (buffers, etc) + switch (arg->kind()) + { + case Kernel::Arg::Buffer: + { + MemObject *buffer = *(MemObject **)arg->data(); + + if (arg->file() == Kernel::Arg::Local) + { + // Alloc a buffer and pass it to the kernel + void *local_buffer = std::malloc(arg->allocAtKernelRuntime()); + locals_to_free.push_back(local_buffer); + *(void **)target = local_buffer; + } + else + { + if (!buffer) + { + // We can do that, just send NULL + *(void **)target = NULL; + } + else + { + // Get the CPU buffer, allocate it and get its pointer + CPUBuffer *cpubuf = + (CPUBuffer *)buffer->deviceBuffer(p_kernel->device()); + void *buf_ptr = 0; + + buffer->allocate(p_kernel->device()); + buf_ptr = cpubuf->data(); + + *(void **)target = buf_ptr; + } + } + + break; + } + case Kernel::Arg::Image2D: + case Kernel::Arg::Image3D: + { + // We need to ensure the image is allocated + Image2D *image = *(Image2D **)arg->data(); + image->allocate(p_kernel->device()); + + // Fall through to the memcpy + } + default: + // Simply copy the arg's data into the buffer + std::memcpy(target, arg->data(), size); + break; + } + } + + // Cache the arguments if we can do so + if (!p_kernel->kernel()->hasLocals()) + p_cpu_event->cacheKernelArgs(rs); + + return rs; +} + +bool CPUKernelWorkGroup::run() +{ + // Get the kernel function to call + std::vector<void *> 
locals_to_free; + llvm::Function *kernel_func = p_kernel->callFunction(); + + if (!kernel_func) + return false; + + Program *p = (Program *)p_kernel->kernel()->parent(); + CPUProgram *prog = (CPUProgram *)(p->deviceDependentProgram(p_kernel->device())); + + // Make object usable for execution: (only applies to MCJIT): + prog->jit()->finalizeObject(); + + std::string kname = kernel_func->getName().str(); + + // original + p_kernel_func_addr = + (void(*)(void *))prog->jit()->getPointerToFunction(kernel_func); + + // TAG + // llvm::Function *t_func = prog->jit()->FindFunctionNamed(p_kernel->p_kernel->p_name->str()); +// llvm::Function *t_func = prog->jit()->FindFunctionNamed(p_kernel->kernel()->p_name.c_str()); +// p_kernel_func_addr = (void(*)(void *))prog->jit()->getPointerToFunction(t_func); + p_kernel_func_addr =(void(*)(void *)) prog->jit()->getFunctionAddress(kname); + + // Get the arguments + p_args = callArgs(locals_to_free); + + // Tell the builtins this thread will run a kernel work group + setThreadLocalWorkGroup(this); + + // Initialize the dummy context used by the builtins before a call to barrier() + p_current_work_item = 0; + p_current_context = &p_dummy_context; + + std::memset(p_dummy_context.local_id, 0, p_work_dim * sizeof(size_t)); + + do + { + // Simply call the "call function", it and the builtins will do the rest + p_kernel_func_addr(p_args); + } while (!p_had_barrier && + !incVec(p_work_dim, p_dummy_context.local_id, p_max_local_id)); + + // If no barrier() call was made, all is fine. If not, only the first + // work-item has currently finished. We must let the others run. + if (p_had_barrier) + { + Context *main_context = p_current_context; // After the first swapcontext, + // we will not be able to trust + // p_current_context anymore. + + // We'll call swapcontext for each remaining work-item. 
They will + // finish, and when they'll do so, this main context will be resumed, so + // it's easy (i starts from 1 because the main context already finished) + for (unsigned int i=1; i<p_num_work_items; ++i) + { + Context *ctx = getContextAddr(i); + swapcontext(&main_context->context, &ctx->context); + } + } + + // Free the allocated locals + if (p_kernel->kernel()->hasLocals()) + { + for (size_t i=0; i<locals_to_free.size(); ++i) + { + std::free(locals_to_free[i]); + } + + std::free(p_args); + } + + return true; +} + +CPUKernelWorkGroup::Context *CPUKernelWorkGroup::getContextAddr(unsigned int index) +{ + size_t size; + char *data = (char *)p_contexts; + + // Each Context in data is an element of size p_stack_size + sizeof(Context) + size = p_stack_size + sizeof(Context); + size *= index; // To get an offset + + return (Context *)(data + size); // Pointer to the context +} diff --git a/src/core/cpu/kernel.h b/src/core/cpu/kernel.h new file mode 100644 index 0000000..ab4d1ac --- /dev/null +++ b/src/core/cpu/kernel.h @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file cpu/kernel.h + * \brief CPU kernel + */ + +#ifndef __CPU_KERNEL_H__ +#define __CPU_KERNEL_H__ + +#include "../deviceinterface.h" +#include <core/config.h> + +#include <llvm/ExecutionEngine/GenericValue.h> +#include <vector> +#include <string> + +#include <ucontext.h> +#include <pthread.h> +#include <stdint.h> + +namespace llvm +{ + class Function; +} + +namespace Coal +{ + +class CPUDevice; +class Kernel; +class KernelEvent; +class Image2D; +class Image3D; + +/** + * \brief CPU kernel + * + * This class holds passive information about a kernel (\c Coal::Kernel object + * and device on which it is run) and provides the \c callFunction() function. + * + * This function is described at the end of \ref llvm . 
+ * + * \see Coal::CPUKernelWorkGroup + */ +class CPUKernel : public DeviceKernel +{ + public: + /** + * \brief Constructor + * \param device device on which the kernel will be run + * \param kernel \c Coal::Kernel object holding information about this + * kernel + * \param function \c llvm::Function to run + */ + CPUKernel(CPUDevice *device, Kernel *kernel, llvm::Function *function); + ~CPUKernel(); + + size_t workGroupSize(); + cl_ulong localMemSize() const; + cl_ulong privateMemSize() const; + size_t preferredWorkGroupSizeMultiple() const; + size_t guessWorkGroupSize(cl_uint num_dims, cl_uint dim, + size_t global_work_size) const; + + Kernel *kernel() const; /*!< \brief \c Coal::Kernel object this kernel will run */ + CPUDevice *device() const; /*!< \brief device on which the kernel will be run */ + + llvm::Function *function() const; /*!< \brief \c llvm::Function representing the kernel but <strong>not to be run</strong> */ + llvm::Function *callFunction(); /*!< \brief stub function used to run the kernel, see \ref llvm */ + + /** + * \brief Calculate where to place a value in an array + * + * This function is used to calculate where to place a value in an + * array given its size, properly aligning it. 
+ * + * This function is called repeatedly to obtain the aligned position of + * each value that must be place in the array + * + * \code + * size_t array_len = 0, array_offset = 0; + * void *array; + * + * // First, get the array size given alignment constraints + * typeOffset(array_len, sizeof(int)); + * typeOffset(array_len, sizeof(float)); + * typeOffset(array_len, sizeof(void *)); + * + * // Then, allocate memory + * array = malloc(array_len) + * + * // Finally, place the arguments + * *(int *)((char *)array + typeOffset(array_offset, sizeof(int))) = 1337; + * *(float *)((char *)array + typeOffset(array_offset, sizeof(int))) = 3.1415f; + * *(void **)((char *)array + typeOffset(array_offset, sizeof(int))) = array; + * \endcode + * + * \param offset offset at which the value will be placed. This variable + * gets incremented by <tt>type_len + padding</tt>. + * \param type_len size in bytes of the value that will be stored + * \return offset at which the value will be stored (equal to \p offset + * before incrementation. + */ + static size_t typeOffset(size_t &offset, size_t type_len); + + private: + CPUDevice *p_device; + Kernel *p_kernel; + llvm::Function *p_function, *p_call_function; + pthread_mutex_t p_call_function_mutex; +}; + +class CPUKernelEvent; + +/** + * \brief CPU kernel work-group + * + * This class represent a bulk of work-items that will be run. It is the one + * to actually run the kernel of its elements. 
+ * + * \see \ref llvm + * \nosubgrouping + */ +class CPUKernelWorkGroup +{ + public: + /** + * \brief Constructor + * \param kernel kernel to run + * \param event event containing information about the kernel run + * \param cpu_event CPU-specific information and cache about \p event + * \param work_group_index index of this work-group in the kernel + */ + CPUKernelWorkGroup(CPUKernel *kernel, KernelEvent *event, + CPUKernelEvent *cpu_event, + const size_t *work_group_index); + ~CPUKernelWorkGroup(); + + /** + * \brief Build a structure of arguments + * + * As C doesn't support calling functions with variable arguments + * unknown at the compilation, this function builds the list of + * arguments in memory. This array will then be passed to a LLVM stub + * function reading it and passing its values to the actuel kernel. + * + * \see \ref llvm + * \param locals_to_free if this kernel takes \c __local arguments, they + * must be \c malloc()'ed for every work-group. + * They are placed in this vector to be + * \c free()'ed at the end of \c run(). + * \return address of a memory location containing the arguments + */ + void *callArgs(std::vector<void *> &locals_to_free); + + /** + * \brief Run the work-group + * + * This function is the core of CPU-acceleration. It runs the work-items + * of this work-group given the correct arguments. 
+ * + * \see \ref llvm + * \see \ref barrier + * \see callArgs() + * \return true if success, false in case of an error + */ + bool run(); + + /** + * \name Native implementation of built-in OpenCL C functions + * @{ + */ + size_t getGlobalId(cl_uint dimindx) const; + cl_uint getWorkDim() const; + size_t getGlobalSize(cl_uint dimindx) const; + size_t getLocalSize(cl_uint dimindx) const; + size_t getLocalID(cl_uint dimindx) const; + size_t getNumGroups(cl_uint dimindx) const; + size_t getGroupID(cl_uint dimindx) const; + size_t getGlobalOffset(cl_uint dimindx) const; + + void barrier(unsigned int flags); + + void *getImageData(Image2D *image, int x, int y, int z) const; + + void writeImage(Image2D *image, int x, int y, int z, float *color) const; + void writeImage(Image2D *image, int x, int y, int z, int32_t *color) const; + void writeImage(Image2D *image, int x, int y, int z, uint32_t *color) const; + + void readImage(float *result, Image2D *image, int x, int y, int z, + uint32_t sampler) const; + void readImage(int32_t *result, Image2D *image, int x, int y, int z, + uint32_t sampler) const; + void readImage(uint32_t *result, Image2D *image, int x, int y, int z, + uint32_t sampler) const; + + void readImage(float *result, Image2D *image, float x, float y, float z, + uint32_t sampler) const; + void readImage(int32_t *result, Image2D *image, float x, float y, float z, + uint32_t sampler) const; + void readImage(uint32_t *result, Image2D *image, float x, float y, float z, + uint32_t sampler) const; + /** + * @} + */ + + /** + * \brief Function called when a built-in name cannot be found + */ + void builtinNotFound(const std::string &name) const; + + private: + template<typename T> + void writeImageImpl(Image2D *image, int x, int y, int z, T *color) const; + template<typename T> + void readImageImplI(T *result, Image2D *image, int x, int y, int z, + uint32_t sampler) const; + template<typename T> + void readImageImplF(T *result, Image2D *image, float x, float y, float 
z, + uint32_t sampler) const; + template<typename T> + void linear3D(T *result, float a, float b, float c, + int i0, int j0, int k0, int i1, int j1, int k1, + Image3D *image) const; + template<typename T> + void linear2D(T *result, float a, float b, float c, int i0, int j0, + int i1, int j1, Image2D *image) const; + + private: + CPUKernel *p_kernel; + CPUKernelEvent *p_cpu_event; + KernelEvent *p_event; + cl_uint p_work_dim; + size_t p_index[MAX_WORK_DIMS], + p_max_local_id[MAX_WORK_DIMS], + p_global_id_start_offset[MAX_WORK_DIMS]; + + void (*p_kernel_func_addr)(void *); + void *p_args; + + // Machinery to have barrier() working + struct Context + { + size_t local_id[MAX_WORK_DIMS]; + ucontext_t context; + unsigned int initialized; + }; + + Context *getContextAddr(unsigned int index); + + Context *p_current_context; + Context p_dummy_context; + void *p_contexts; + size_t p_stack_size; + unsigned int p_num_work_items, p_current_work_item; + bool p_had_barrier; +}; + +/** + * \brief CPU-specific information about a kernel event + * + * This class put in a \c Coal::KernelEvent device-data field + * (see \c Coal::Event::setDeviceData()) is responsible for dispatching the + * \c Coal::CPUKernelWorkGroup objects between the CPU worker threads. + */ +class CPUKernelEvent +{ + public: + /** + * \brief Constructor + * \param device device running the kernel + * \param event \c Coal::KernelEvent holding device-agnostic data + * about the event + */ + CPUKernelEvent(CPUDevice *device, KernelEvent *event); + ~CPUKernelEvent(); + + bool reserve(); /*!< \brief The next Work Group that will execute will be the last. Locks the event */ + bool finished(); /*!< \brief All the work groups have finished */ + CPUKernelWorkGroup *takeInstance(); /*!< \brief Must be called exactly one time after reserve(). 
Unlocks the event */ + + void *kernelArgs() const; /*!< \brief Return the cached kernel arguments */ + void cacheKernelArgs(void *args); /*!< \brief Cache pre-built kernel arguments */ + + void workGroupFinished(); /*!< \brief A work-group has just finished */ + + private: + CPUDevice *p_device; + KernelEvent *p_event; + size_t p_current_work_group[MAX_WORK_DIMS], + p_max_work_groups[MAX_WORK_DIMS]; + size_t p_current_wg, p_finished_wg, p_num_wg; + pthread_mutex_t p_mutex; + void *p_kernel_args; +}; + +} + +#endif diff --git a/src/core/cpu/program.cpp b/src/core/cpu/program.cpp new file mode 100644 index 0000000..7eb632c --- /dev/null +++ b/src/core/cpu/program.cpp @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file cpu/program.cpp + * \brief CPU program + */ + +#include "program.h" +#include "device.h" +#include "kernel.h" +#include "builtins.h" + +#include "../program.h" + +#include <llvm/PassManager.h> +#include <llvm/Analysis/Passes.h> +#include <llvm/IR/Verifier.h> +#include <llvm/Transforms/Scalar.h> +#include <llvm/Transforms/IPO.h> +#include <llvm/ExecutionEngine/ExecutionEngine.h> +#include <llvm/ExecutionEngine/MCJIT.h> +#include <llvm/ExecutionEngine/SectionMemoryManager.h> +#include <llvm/ExecutionEngine/Interpreter.h> +#include <llvm/Support/ErrorHandling.h> + +#include <string> +#include <iostream> + +using namespace Coal; +using namespace llvm; + +// Create a custom memory manager for MCJIT +class ClientMemoryManager : public SectionMemoryManager +{ + ClientMemoryManager(const ClientMemoryManager&) LLVM_DELETED_FUNCTION; + void operator=(const ClientMemoryManager&) LLVM_DELETED_FUNCTION; + +public: + ClientMemoryManager() {} + virtual ~ClientMemoryManager() {} + + /// This method returns the (host) address of the specified function. + virtual uint64_t getSymbolAddress(const std::string &Name); +}; + +uint64_t ClientMemoryManager::getSymbolAddress(const std::string &Name) +{ + // Try the standard symbol resolution first, but ask it not to abort. 
+ uint64_t addr = RTDyldMemoryManager::getSymbolAddress(Name); + if (!addr) { + addr = (uint64_t)getBuiltin(Name); + } + + if (!addr) + report_fatal_error("OpenCL program references external function '" + Name + + "' which could not be resolved!"); + return addr; +} + +CPUProgram::CPUProgram(CPUDevice *device, Program *program) +: DeviceProgram(), p_device(device), p_program(program), p_jit(0) +{ + +} + +CPUProgram::~CPUProgram() +{ + if (p_jit) + { + // Dont delete the module + p_jit->removeModule(p_module); + + delete p_jit; + } +} + +bool CPUProgram::linkStdLib() const +{ + return true; +} + +void CPUProgram::createOptimizationPasses(llvm::PassManager *manager, + bool optimize, bool hasBarrier) +{ + if (optimize) + { + /* + * Inspired by code from "The LLVM Compiler Infrastructure" + */ + manager->add(llvm::createDeadArgEliminationPass()); + manager->add(llvm::createInstructionCombiningPass()); + manager->add(llvm::createFunctionInliningPass()); + manager->add(llvm::createPruneEHPass()); // Remove dead EH info. + manager->add(llvm::createGlobalOptimizerPass()); + manager->add(llvm::createGlobalDCEPass()); // Remove dead functions. + manager->add(llvm::createArgumentPromotionPass()); + manager->add(llvm::createInstructionCombiningPass()); + manager->add(llvm::createJumpThreadingPass()); + manager->add(llvm::createScalarReplAggregatesPass()); + manager->add(llvm::createFunctionAttrsPass()); // Add nocapture. + manager->add(llvm::createGlobalsModRefPass()); // IP alias analysis. + manager->add(llvm::createLICMPass()); // Hoist loop invariants. + manager->add(llvm::createGVNPass()); // Remove redundancies. + manager->add(llvm::createMemCpyOptPass()); // Remove dead memcpys. 
+ manager->add(llvm::createDeadStoreEliminationPass()); + manager->add(llvm::createInstructionCombiningPass()); + manager->add(llvm::createJumpThreadingPass()); + manager->add(llvm::createCFGSimplificationPass()); + } +} + +bool CPUProgram::build(llvm::Module *module, std::string *binary_str) +{ + // Nothing to build + p_module = module; + + return true; +} + +bool CPUProgram::initJIT() +{ + if (p_jit) + return true; + + if (!p_module) + return false; + + // Create the JIT + std::string err; + + p_jit = llvm::EngineBuilder(p_module) + .setErrorStr(&err) + .setUseMCJIT(true) + .setMCJITMemoryManager(new ClientMemoryManager()) + .create(); + + if (!p_jit) + { + std::cout << "Unable to create a JIT: " << err << std::endl; + return false; + } + + return true; +} + +llvm::ExecutionEngine *CPUProgram::jit() const +{ + return p_jit; +} diff --git a/src/core/cpu/program.h b/src/core/cpu/program.h new file mode 100644 index 0000000..0a08d61 --- /dev/null +++ b/src/core/cpu/program.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
*
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/**
 * \file cpu/program.h
 * \brief CPU program
 */

#ifndef __CPU_PROGRAM_H__
#define __CPU_PROGRAM_H__

#include "../deviceinterface.h"

namespace llvm
{
    class ExecutionEngine;
    class Module;
}

namespace Coal
{

class CPUDevice;
class Program;

/**
 * \brief CPU program
 *
 * This class implements the \c Coal::DeviceProgram interface for CPU
 * acceleration.
 *
 * Its main purpose is to initialize a \c llvm::ExecutionEngine object to run
 * LLVM bitcode, in \c initJIT().
 */
class CPUProgram : public DeviceProgram
{
    public:
        /**
         * \brief Constructor
         * \param device CPU device to which this program is attached
         * \param program \c Coal::Program that will be run
         */
        CPUProgram(CPUDevice *device, Program *program);
        ~CPUProgram();

        /** \brief Whether the standard library must be linked into the program */
        bool linkStdLib() const;

        /**
         * \brief Add the device-specific optimization passes to \p manager
         * \param manager pass manager receiving the passes
         * \param optimize whether optimization passes are wanted
         * \param hasBarrier optional hint, defaults to false
         */
        void createOptimizationPasses(llvm::PassManager *manager,
                                      bool optimize, bool hasBarrier=false);

        /**
         * \brief Give the program its LLVM module
         * \param module LLVM module of the program
         * \param binary_str binary string associated with the build
         * \return true if success, false otherwise
         */
        bool build(llvm::Module *module, std::string *binary_str);

        /**
         * \brief Initialize an LLVM JIT
         *
         * This function creates a \c llvm::ExecutionEngine object to run this
         * program on the CPU. A few implementation details :
         *
         * - The JIT is set not to resolve unknown symbols using \c dlsym().
         *   This way, a malicious kernel cannot execute arbitrary code on
         *   the host by declaring \c libc functions and calling them.
         * - All the unknown function names are passed to \c getBuiltin() to
         *   get native built-in implementations.
         *
         * NOTE(review): the memory manager in cpu/program.cpp asks
         * \c RTDyldMemoryManager::getSymbolAddress() before \c getBuiltin();
         * confirm that the first point above still holds with that lookup.
         *
         * \return true if success, false otherwise
         */
        bool initJIT();
        llvm::ExecutionEngine *jit() const; /*!< \brief Current LLVM execution engine */

    private:
        CPUDevice *p_device;            // device given to the constructor
        Program *p_program;             // program given to the constructor

        llvm::ExecutionEngine *p_jit;   // engine returned by jit()
        llvm::Module *p_module;         // module handed over by build()
};

}

#endif
diff --git a/src/core/cpu/sampler.cpp b/src/core/cpu/sampler.cpp new file mode 100644 index 0000000..893e66e --- /dev/null +++ b/src/core/cpu/sampler.cpp @@ -0,0 +1,769 @@
/*
 * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the copyright holder nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.
IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file cpu/sampler.cpp + * \brief OpenCL C image access functions + * + * It is recommended to compile this file using Clang as it supports the + * \c __builtin_shufflevector() built-in function, providing SSE or + * NEON-accelerated code. + */ + +#include "../memobject.h" +#include "../sampler.h" +#include "kernel.h" +#include "buffer.h" +#include "builtins.h" + +#include <cstdlib> +#include <cmath> +// ASW #include <immintrin.h> + +using namespace Coal; + +/* + * Helper functions + */ + +static int clamp(int a, int b, int c) +{ + return (a < b) ? b : ((a > c) ? c : a); +} + +static int min(int a, int b) +{ + return (a < b ? a : b); +} + +static int max(int a, int b) +{ + return (a > b ? a : b); +} + +static float frac(float x) +{ + return x - std::floor(x); +} + +static float round(float x) +{ + return (float)(int)x; +} + +static bool handle_address_mode(Image2D *image, int &x, int &y, int &z, + uint32_t sampler) +{ + bool is_3d = (image->type() == MemObject::Image3D); + int w = image->width(), + h = image->height(), + d = (is_3d ? 
((Image3D *)image)->depth() : 1); + + if ((sampler & 0xf0) == CLK_ADDRESS_CLAMP_TO_EDGE) + { + x = clamp(x, 0, w - 1); + y = clamp(y, 0, h - 1); + if (is_3d) z = clamp(z, 0, d - 1); + } + else if ((sampler & 0xf0) == CLK_ADDRESS_CLAMP) + { + x = clamp(x, 0, w); + y = clamp(y, 0, h); + if (is_3d) z = clamp(z, 0, d); + } + + return (x == w || y == h || z == d); +} + +/* + * Macros or functions used to accelerate the functions + */ +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + +static void slow_shuffle4(uint32_t *rs, uint32_t *a, uint32_t *b, + int x, int y, int z, int w) +{ + rs[0] = (x < 4 ? a[x] : b[x - 4]); + rs[1] = (y < 4 ? a[y] : b[y - 4]); + rs[2] = (z < 4 ? a[z] : b[z - 4]); + rs[3] = (w < 4 ? a[w] : b[w - 4]); +} + +static void convert_to_format(void *dest, float *data, + cl_channel_type type, unsigned int channels) +{ + // Convert always the four components of source to target + if (type == CL_FLOAT) + std::memcpy(dest, data, channels * sizeof(float)); + + for (unsigned int i=0; i<channels; ++i) + { + switch (type) + { + case CL_SNORM_INT8: + ((int8_t *)dest)[i] = data[i] * 128.0f; + break; + case CL_SNORM_INT16: + ((int16_t *)dest)[i] = data[i] * 32767.0f; + break; + case CL_UNORM_INT8: + ((uint8_t *)dest)[i] = data[i] * 255.0f; + break; + case CL_UNORM_INT16: + ((uint16_t *)dest)[i] = data[i] * 65535.0f; + break; + } + } +} + +static void convert_from_format(float *data, void *source, + cl_channel_type type, unsigned int channels) +{ + // Convert always the four components of source to target + if (type == CL_FLOAT) + std::memcpy(data, source, channels * sizeof(float)); + + for (unsigned int i=0; i<channels; ++i) + { + switch (type) + { + case CL_SNORM_INT8: + data[i] = (float)((int8_t *)source)[i] / 127.0f; + break; + case CL_SNORM_INT16: + data[i] = (float)((int16_t *)source)[i] / 32767.0f; + break; + case CL_UNORM_INT8: + data[i] = (float)((uint8_t *)source)[i] / 127.0f; + break; + case CL_UNORM_INT16: + data[i] = (float)((uint16_t 
*)source)[i] / 127.0f; + break; + } + } +} + +static void convert_to_format(void *dest, int *data, + cl_channel_type type, unsigned int channels) +{ + // Convert always the four components of source to target + if (type == CL_SIGNED_INT32) + std::memcpy(dest, data, channels * sizeof(int32_t)); + + for (unsigned int i=0; i<channels; ++i) + { + switch (type) + { + case CL_SIGNED_INT8: + ((int8_t *)dest)[i] = data[i]; + break; + case CL_SIGNED_INT16: + ((int16_t *)dest)[i] = data[i]; + break; + } + } +} + +static void convert_from_format(int32_t *data, void *source, + cl_channel_type type, unsigned int channels) +{ + // Convert always the four components of source to target + if (type == CL_SIGNED_INT32) + std::memcpy(data, source, channels * sizeof(int32_t)); + + for (unsigned int i=0; i<channels; ++i) + { + switch (type) + { + case CL_SIGNED_INT8: + data[i] = ((int8_t *)source)[i]; + break; + case CL_SIGNED_INT16: + data[i] = ((int16_t *)source)[i]; + break; + } + } +} + +static void convert_to_format(void *dest, uint32_t *data, + cl_channel_type type, unsigned int channels) +{ + // Convert always the four components of source to target + if (type == CL_UNSIGNED_INT32) + std::memcpy(dest, data, channels * sizeof(uint32_t)); + + for (unsigned int i=0; i<3; ++i) + { + switch (type) + { + case CL_UNSIGNED_INT8: + ((uint8_t *)dest)[i] = data[i]; + break; + case CL_UNSIGNED_INT16: + ((uint16_t *)dest)[i] = data[i]; + break; + } + } +} + +static void convert_from_format(uint32_t *data, void *source, + cl_channel_type type, unsigned int channels) +{ + // Convert always the four components of source to target + if (type == CL_UNSIGNED_INT32) + std::memcpy(data, source, channels * sizeof(uint32_t)); + + for (unsigned int i=0; i<channels; ++i) + { + switch (type) + { + case CL_UNSIGNED_INT8: + data[i] = ((uint8_t *)source)[i]; + break; + case CL_UNSIGNED_INT16: + data[i] = ((uint16_t *)source)[i]; + break; + } + } +} + +template<typename T> +static void vec4_scalar_mul(T 
*vec, float val) +{ + for (unsigned int i=0; i<4; ++i) + vec[i] *= val; +} + +template<typename T> +static void vec4_add(T *vec1, T *vec2) +{ + for (unsigned int i=0; i<4; ++i) + vec1[i] += vec2[i]; +} + +template<typename T> +void CPUKernelWorkGroup::linear3D(T *result, float a, float b, float c, + int i0, int j0, int k0, int i1, int j1, int k1, + Image3D *image) const +{ + T accum[4]; + + readImageImplI<T>(result, image, i0, j0, k0, 0); + vec4_scalar_mul(result, (1.0f - a) * (1.0f - b) * (1.0f - c )); + + readImageImplI<T>(accum, image, i1, j0, k0, 0); + vec4_scalar_mul(accum, a * (1.0f - b) * (1.0f - c )); + vec4_add(result, accum); + + readImageImplI<T>(accum, image, i0, j1, k0, 0); + vec4_scalar_mul(accum, (1.0f - a) * b * (1.0f - c )); + vec4_add(result, accum); + + readImageImplI<T>(accum, image, i1, j1, k0, 0); + vec4_scalar_mul(accum, a * b * (1.0f -c )); + vec4_add(result, accum); + + readImageImplI<T>(accum, image, i0, j0, k1, 0); + vec4_scalar_mul(accum, (1.0f - a) * (1.0f - b) * c); + vec4_add(result, accum); + + readImageImplI<T>(accum, image, i1, j0, k1, 0); + vec4_scalar_mul(accum, a * (1.0f - b) * c); + vec4_add(result, accum); + + readImageImplI<T>(accum, image, i0, j1, k1, 0); + vec4_scalar_mul(accum, (1.0f - a) * b * c); + vec4_add(result, accum); + + readImageImplI<T>(accum, image, i1, j1, k1, 0); + vec4_scalar_mul(accum, a * b * c); + vec4_add(result, accum); +} + +template<typename T> +void CPUKernelWorkGroup::linear2D(T *result, float a, float b, float c, int i0, int j0, + int i1, int j1, Image2D *image) const +{ + T accum[4]; + + readImageImplI<T>(result, image, i0, j0, 0, 0); + vec4_scalar_mul(result, (1.0f - a) * (1.0f - b)); + + readImageImplI<T>(accum, image, i1, j0, 0, 0); + vec4_scalar_mul(accum, a * (1.0f - b)); + vec4_add(result, accum); + + readImageImplI<T>(accum, image, i0, j1, 0, 0); + vec4_scalar_mul(accum, (1.0f - a) * b); + vec4_add(result, accum); + + readImageImplI<T>(accum, image, i1, j1, 0, 0); + vec4_scalar_mul(accum, a 
* b); + vec4_add(result, accum); +} + +#if __has_builtin(__builtin_shufflevector) + #define shuffle4(rs, a, b, x, y, z, w) \ + *(__v4sf *)rs = __builtin_shufflevector(*(__v4sf *)a, *(__v4sf *)b, \ + x, y, z, w) +#else + #define shuffle4(rs, a, b, x, y, z, w) \ + slow_shuffle4(rs, a, b, x, y, z, w) +#endif + +static void swizzle(uint32_t *target, uint32_t *source, + cl_channel_order order, bool reading, uint32_t t_max) +{ + uint32_t special[4] = {0, t_max, 0, 0 }; + + if (reading) + { + switch (order) + { + case CL_R: + case CL_Rx: + // target = {source->x, 0, 0, t_max} + shuffle4(target, source, special, 0, 4, 4, 5); + break; + case CL_A: + // target = {0, 0, 0, source->x} + shuffle4(target, source, special, 4, 4, 4, 0); + break; + case CL_INTENSITY: + // target = {source->x, source->x, source->x, source->x} + shuffle4(target, source, source, 0, 0, 0, 0); + break; + case CL_LUMINANCE: + // target = {source->x, source->x, source->x, t_max} + shuffle4(target, source, special, 0, 0, 0, 5); + break; + case CL_RG: + case CL_RGx: + // target = {source->x, source->y, 0, t_max} + shuffle4(target, source, special, 0, 1, 4, 5); + break; + case CL_RA: + // target = {source->x, 0, 0, source->y} + shuffle4(target, source, special, 0, 4, 4, 1); + break; + case CL_RGB: + case CL_RGBx: + case CL_RGBA: + // Nothing to do, already the good order + std::memcpy(target, source, 16); + break; + case CL_ARGB: + // target = {source->y, source->z, source->w, source->x} + shuffle4(target, source, source, 1, 2, 3, 0); + break; + case CL_BGRA: + // target = {source->z, source->y, source->x, source->w} + shuffle4(target, source, source, 2, 1, 0, 3); + break; + } + } + else + { + switch (order) + { + case CL_A: + // target = {source->w, undef, undef, undef} + shuffle4(target, source, source, 3, 3, 3, 3); + break; + case CL_RA: + // target = {source->x, source->w, undef, undef} + shuffle4(target, source, source, 0, 3, 3, 3); + break; + case CL_ARGB: + // target = {source->w, source->x, 
source->y, source->z} + shuffle4(target, source, source, 3, 0, 1, 2); + break; + case CL_BGRA: + // target = {source->z, source->y, source->x, source->w} + shuffle4(target, source, source, 2, 1, 0, 3); + break; + default: + std::memcpy(target, source, 16); + } + } +} + +/* + * Actual implementation of the built-ins + */ + +void *CPUKernelWorkGroup::getImageData(Image2D *image, int x, int y, int z) const +{ + CPUBuffer *buffer = + (CPUBuffer *)image->deviceBuffer((DeviceInterface *)p_kernel->device()); + + return imageData((unsigned char *)buffer->data(), + x, y, z, + image->row_pitch(), + image->slice_pitch(), + image->pixel_size()); +} + +template<typename T> +void CPUKernelWorkGroup::writeImageImpl(Image2D *image, int x, int y, int z, + T *color) const +{ + T converted[4]; + + // Swizzle to the correct order (float, int and uint are 32-bit, so the + // type has no importance + swizzle((uint32_t *)converted, (uint32_t *)color, + image->format().image_channel_order, false, 0); + + // Get a pointer in the image where to write the data + void *target = getImageData(image, x, y, z); + + // Convert color to the correct format + convert_to_format(target, + converted, + image->format().image_channel_data_type, + image->channels()); +} + +void CPUKernelWorkGroup::writeImage(Image2D *image, int x, int y, int z, + float *color) const +{ + writeImageImpl<float>(image, x, y, z, color); +} + +void CPUKernelWorkGroup::writeImage(Image2D *image, int x, int y, int z, + int32_t *color) const +{ + writeImageImpl<int32_t>(image, x, y, z, color); +} + +void CPUKernelWorkGroup::writeImage(Image2D *image, int x, int y, int z, + uint32_t *color) const +{ + writeImageImpl<uint32_t>(image, x, y, z, color); +} + +template<typename T> +uint32_t type_max_value() +{ + return 0; +} + +template<> +uint32_t type_max_value<float>() +{ + return 1065353216; // 1.0f in decimal form +} + +template<> +uint32_t type_max_value<int32_t>() +{ + return 0x7fffffff; +} + +template<> +uint32_t 
type_max_value<uint32_t>() +{ + return 0xffffffff; +} + +template<typename T> +void CPUKernelWorkGroup::readImageImplI(T *result, Image2D *image, int x, int y, + int z, uint32_t sampler) const +{ + // Handle the addressing mode of the sampler + if (handle_address_mode(image, x, y, z, sampler)) + { + // Border color + result[0] = 0.0f; + result[1] = 0.0f; + result[2] = 0.0f; + + switch (image->format().image_channel_order) + { + case CL_R: + case CL_RG: + case CL_RGB: + case CL_LUMINANCE: + result[3] = 1.0f; + break; + default: + result[3] = 0.0f; + } + + return; + } + + // Load the data from the image, converting it + void *source = getImageData(image, x, y, z); + T converted[4]; + + convert_from_format(converted, + source, + image->format().image_channel_data_type, + image->channels()); + + // Swizzle the pixel just read and place it in result + swizzle((uint32_t *)result, (uint32_t *)converted, + image->format().image_channel_order, true, type_max_value<T>()); +} + +void CPUKernelWorkGroup::readImage(float *result, Image2D *image, int x, int y, + int z, uint32_t sampler) const +{ + readImageImplI<float>(result, image, x, y, z, sampler); +} + +void CPUKernelWorkGroup::readImage(int32_t *result, Image2D *image, int x, int y, + int z, uint32_t sampler) const +{ + readImageImplI<int32_t>(result, image, x, y, z, sampler); +} + +void CPUKernelWorkGroup::readImage(uint32_t *result, Image2D *image, int x, int y, + int z, uint32_t sampler) const +{ + readImageImplI<uint32_t>(result, image, x, y, z, sampler); +} + +template<typename T> +void CPUKernelWorkGroup::readImageImplF(T *result, Image2D *image, float x, + float y, float z, uint32_t sampler) const +{ + bool is_3d = (image->type() == MemObject::Image3D); + Image3D *image3d = (Image3D *)image; + + int w = image->width(), + h = image->height(), + d = (is_3d ? 
image3d->depth() : 1); + + switch (sampler & 0xf0) + { + case CLK_ADDRESS_NONE: + case CLK_ADDRESS_CLAMP: + case CLK_ADDRESS_CLAMP_TO_EDGE: + /* De-normalize coordinates */ + if ((sampler & 0xf) == CLK_NORMALIZED_COORDS_TRUE) + { + x *= (float)w; + y *= (float)h; + if (is_3d) z *= (float)d; + } + + switch (sampler & 0xf00) + { + case CLK_FILTER_NEAREST: + { + readImageImplI<T>(result, image, std::floor(x), + std::floor(y), std::floor(z), sampler); + } + case CLK_FILTER_LINEAR: + { + float a, b, c; + + a = frac(x - 0.5f); + b = frac(y - 0.5f); + c = frac(z - 0.5f); + + if (is_3d) + { + linear3D<T>(result, a, b, c, + std::floor(x - 0.5f), + std::floor(y - 0.5f), + std::floor(z - 0.5f), + std::floor(x - 0.5f) + 1, + std::floor(y - 0.5f) + 1, + std::floor(z - 0.5f) + 1, + image3d); + } + else + { + linear2D<T>(result, a, b, c, + std::floor(x - 0.5f), + std::floor(y - 0.5f), + std::floor(x - 0.5f) + 1, + std::floor(y - 0.5f) + 1, + image); + } + } + } + break; + case CLK_ADDRESS_REPEAT: + switch (sampler & 0xf00) + { + case CLK_FILTER_NEAREST: + { + int i, j, k; + + x = (x - std::floor(x)) * (float)w; + i = std::floor(x); + if (i > w - 1) + i = i - w; + + y = (y - std::floor(y)) * (float)h; + j = std::floor(y); + if (j > h - 1) + j = j - h; + + if (is_3d) + { + z = (z - std::floor(z)) * (float)d; + k = std::floor(z); + if (k > d - 1) + k = k - d; + } + + readImageImplI<T>(result, image, i, j, k, sampler); + } + case CLK_FILTER_LINEAR: + { + float a, b, c; + int i0, i1, j0, j1, k0, k1; + + x = (x - std::floor(x)) * (float)w; + i0 = std::floor(x - 0.5f); + i1 = i0 + 1; + if (i0 < 0) + i0 = w + i0; + if (i1 > w - 1) + i1 = i1 - w; + + y = (y - std::floor(y)) * (float)h; + j0 = std::floor(y - 0.5f); + j1 = j0 + 1; + if (j0 < 0) + j0 = h + j0; + if (j1 > h - 1) + j1 = j1 - h; + + if (is_3d) + { + z = (z - std::floor(z)) * (float)d; + k0 = std::floor(z - 0.5f); + k1 = k0 + 1; + if (k0 < 0) + k0 = d + k0; + if (k1 > d - 1) + k1 = k1 - d; + } + + a = frac(x - 0.5f); + b = 
frac(y - 0.5f); + c = frac(z - 0.5f); + + if (is_3d) + { + linear3D<T>(result, a, b, c, i0, j0, k0, i1, j1, k1, + image3d); + } + else + { + linear2D<T>(result, a, b, c, i0, j0, i1, j1, image); + } + } + } + break; + case CLK_ADDRESS_MIRRORED_REPEAT: + switch (sampler & 0xf00) + { + case CLK_FILTER_NEAREST: + { + x = std::fabs(x - 2.0f * round(0.5f * x)) * (float)w; + y = std::fabs(y - 2.0f * round(0.5f * y)) * (float)h; + if (is_3d) + z = std::fabs(z - 2.0f * round(0.5f * z)) * (float)d; + + readImageImplI<T>(result, image, + min(std::floor(x), w - 1), + min(std::floor(y), h - 1), + min(std::floor(z), d - 1), + sampler); + } + case CLK_FILTER_LINEAR: + { + float a, b, c; + int i0, i1, j0, j1, k0, k1; + + x = std::fabs(x - 2.0f * round(0.5f * x)) * (float)w; + i0 = std::floor(x - 0.5f); + i1 = i0 + 1; + i0 = max(i0, 0); + i1 = min(i1, w - 1); + + y = std::fabs(y - 2.0f * round(0.5f * y)) * (float)h; + j0 = std::floor(y - 0.5f); + j1 = j0 + 1; + j0 = max(j0, 0); + j1 = min(j1, h - 1); + + if (is_3d) + { + z = std::fabs(z - 2.0f * round(0.5f * z)) * (float)d; + k0 = std::floor(z - 0.5f); + k1 = k0 + 1; + k0 = max(k0, 0); + k1 = min(k1, d - 1); + } + + a = frac(x - 0.5f); + b = frac(y - 0.5f); + c = frac(z - 0.5f); + + if (is_3d) + { + linear3D<T>(result, a, b, c, i0, j0, k0, i1, j1, k1, + image3d); + } + else + { + linear2D<T>(result, a, b, c, i0, j0, i1, j1, image); + } + } + } + break; + } +} + +void CPUKernelWorkGroup::readImage(float *result, Image2D *image, float x, + float y, float z, uint32_t sampler) const +{ + readImageImplF<float>(result, image, x, y, z, sampler); +} + +void CPUKernelWorkGroup::readImage(int32_t *result, Image2D *image, float x, + float y, float z, uint32_t sampler) const +{ + readImageImplF<int32_t>(result, image, x, y, z, sampler); +} + +void CPUKernelWorkGroup::readImage(uint32_t *result, Image2D *image, float x, + float y, float z, uint32_t sampler) const +{ + readImageImplF<uint32_t>(result, image, x, y, z, sampler); +} diff --git 
a/src/core/cpu/worker.cpp b/src/core/cpu/worker.cpp new file mode 100644 index 0000000..e5251f2 --- /dev/null +++ b/src/core/cpu/worker.cpp @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

/**
 * \file cpu/worker.cpp
 * \brief Code running in the worker threads launched by \c Coal::CPUDevice
 * \sa builtins.cpp
 */

#include "worker.h"
#include "device.h"
#include "buffer.h"
#include "kernel.h"
#include "builtins.h"

#include "../commandqueue.h"
#include "../events.h"
#include "../memobject.h"
#include "../kernel.h"

#include <sys/mman.h>

#include <cstring>
#include <iostream>

using namespace Coal;

/**
 * \brief Main loop of a CPU worker thread
 * \param data the \c Coal::CPUDevice owning the thread
 * \return always 0
 *
 * The loop takes events from the device until the device asks the thread to
 * stop. Each event is executed according to its type, then its status is set
 * to \c Event::Complete or, on failure, to the error code. Start and end
 * times are recorded when the command queue of the event has profiling
 * enabled.
 */
void *worker(void *data)
{
    CPUDevice *device = (CPUDevice *)data;
    bool stop = false;
    cl_int errcode;
    Event *event;

    // Initialize TLS
    setWorkItemsData(0, 0);

    while (true)
    {
        event = device->getEvent(stop);

        // Ensure we have a good event and we don't have to stop
        if (stop) break;
        if (!event) continue;

        // Get info about the event and its command queue
        Event::Type t = event->type();
        CommandQueue *queue = 0;
        cl_command_queue_properties queue_props = 0;

        errcode = CL_SUCCESS;

        event->info(CL_EVENT_COMMAND_QUEUE, sizeof(CommandQueue *), &queue, 0);

        // queue_props stays 0 (no profiling) when the event has no queue
        if (queue)
            queue->info(CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties),
                        &queue_props, 0);

        if (queue_props & CL_QUEUE_PROFILING_ENABLE)
            event->updateTiming(Event::Start);

        // Execute the action
        switch (t)
        {
            case Event::ReadBuffer:
            case Event::WriteBuffer:
            {
                ReadWriteBufferEvent *e = (ReadWriteBufferEvent *)event;
                CPUBuffer *buf = (CPUBuffer *)e->buffer()->deviceBuffer(device);
                char *data = (char *)buf->data();

                data += e->offset();

                // Read: buffer -> host pointer, write: host pointer -> buffer
                if (t == Event::ReadBuffer)
                    std::memcpy(e->ptr(), data, e->cb());
                else
                    std::memcpy(data, e->ptr(), e->cb());

                break;
            }
            case Event::CopyBuffer:
            {
                CopyBufferEvent *e = (CopyBufferEvent *)event;
                CPUBuffer *src = (CPUBuffer *)e->source()->deviceBuffer(device);
                CPUBuffer *dst = (CPUBuffer *)e->destination()->deviceBuffer(device);

                std::memcpy((char*)dst->data() + e->dst_offset(),
                            (char*)src->data() + e->src_offset(), e->cb());
                break;
            }
            case Event::ReadBufferRect:
            case Event::WriteBufferRect:
            case Event::CopyBufferRect:
            case Event::ReadImage:
            case Event::WriteImage:
            case Event::CopyImage:
            case Event::CopyBufferToImage:
            case Event::CopyImageToBuffer:
            {
                // src = buffer and dst = mem if not a copy
                ReadWriteCopyBufferRectEvent *e = (ReadWriteCopyBufferRectEvent *)event;
                CPUBuffer *src_buf = (CPUBuffer *)e->source()->deviceBuffer(device);

                unsigned char *src = (unsigned char *)src_buf->data();
                unsigned char *dst;

                switch (t)
                {
                    case Event::CopyBufferRect:
                    case Event::CopyImage:
                    case Event::CopyImageToBuffer:
                    case Event::CopyBufferToImage:
                    {
                        // dst = device buffer of the destination object
                        CopyBufferRectEvent *cbre = (CopyBufferRectEvent *)e;
                        CPUBuffer *dst_buf =
                            (CPUBuffer *)cbre->destination()->deviceBuffer(device);

                        dst = (unsigned char *)dst_buf->data();
                        break;
                    }
                    default:
                    {
                        // dst = host memory location
                        ReadWriteBufferRectEvent *rwbre = (ReadWriteBufferRectEvent *)e;

                        dst = (unsigned char *)rwbre->ptr();
                    }
                }

                // Iterate over the lines to copy and use memcpy
                for (size_t z=0; z<e->region(2); ++z)
                {
                    for (size_t y=0; y<e->region(1); ++y)
                    {
                        unsigned char *s;
                        unsigned char *d;

                        // Pixel size is 1: origins and pitches are used as
                        // byte counts here
                        d = imageData(dst,
                                      e->dst_origin(0),
                                      y + e->dst_origin(1),
                                      z + e->dst_origin(2),
                                      e->dst_row_pitch(),
                                      e->dst_slice_pitch(),
                                      1);

                        s = imageData(src,
                                      e->src_origin(0),
                                      y + e->src_origin(1),
                                      z + e->src_origin(2),
                                      e->src_row_pitch(),
                                      e->src_slice_pitch(),
                                      1);

                        // Copying an image to a buffer may need to add an offset
                        // to the buffer address (its rectangular origin is
                        // always (0, 0, 0)).
                        if (t == Event::CopyBufferToImage)
                        {
                            CopyBufferToImageEvent *cptie = (CopyBufferToImageEvent *)e;
                            s += cptie->offset();
                        }
                        else if (t == Event::CopyImageToBuffer)
                        {
                            CopyImageToBufferEvent *citbe = (CopyImageToBufferEvent *)e;
                            d += citbe->offset();
                        }

                        if (t == Event::WriteBufferRect || t == Event::WriteImage)
                            std::memcpy(s, d, e->region(0)); // Write dest (memory) in src
                        else
                            std::memcpy(d, s, e->region(0)); // Write src (buffer) in dest (memory), or copy the buffers
                    }
                }

                break;
            }
            case Event::MapBuffer:
            case Event::MapImage:
                // All was already done in CPUBuffer::initEventDeviceData()
                break;

            case Event::NativeKernel:
            {
                // Call the host function with the arguments held by the event
                NativeKernelEvent *e = (NativeKernelEvent *)event;
                void (*func)(void *) = (void (*)(void *))e->function();
                void *args = e->args();

                func(args);

                break;
            }
            case Event::NDRangeKernel:
            case Event::TaskKernel:
            {
                KernelEvent *e = (KernelEvent *)event;
                CPUKernelEvent *ke = (CPUKernelEvent *)e->deviceData();

                // Take an instance
                CPUKernelWorkGroup *instance = ke->takeInstance();
                ke = 0;     // Unlocked, don't use anymore

                if (!instance->run())
                    errcode = CL_INVALID_PROGRAM_EXECUTABLE;

                delete instance;

                break;
            }
            default:
                break;
        }

        // Cleanups
        if (errcode == CL_SUCCESS)
        {
            bool finished = true;

            // A kernel event is only complete once CPUKernelEvent::finished()
            // says so; the other events are complete right away
            if (event->type() == Event::NDRangeKernel ||
                event->type() == Event::TaskKernel)
            {
                CPUKernelEvent *ke = (CPUKernelEvent *)event->deviceData();
                finished = ke->finished();
            }

            if (finished)
            {
                // an event may be released once it is Complete
                if (queue_props & CL_QUEUE_PROFILING_ENABLE)
                    event->updateTiming(Event::End);
                event->setStatus(Event::Complete);
            }
        }
        else
        {
            // an event may be released once it is Complete
            if (queue_props & CL_QUEUE_PROFILING_ENABLE)
                event->updateTiming(Event::End);
            // The event failed
            event->setStatus((Event::Status)errcode);
        }
    }

    // Free mmapped() data if needed
    size_t mapped_size;
    void *mapped_data = getWorkItemsData(mapped_size);

    if (mapped_data)
        munmap(mapped_data, mapped_size);

    return 0;
}
diff --git a/src/core/cpu/worker.h b/src/core/cpu/worker.h new file mode 100644 index 0000000..43ddd03 --- /dev/null +++ b/src/core/cpu/worker.h @@ -0,0 +1,45 @@
/*
 * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the copyright holder nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ + +/** + * \file worker.h + * \brief Function run by the CPU worker threads + */ + +#ifndef __CPU_WORKER_H__ +#define __CPU_WORKER_H__ + +/** + * \brief Main loop of the CPU worker threads + * + * This function is run by as many threads as there are CPU cores on the host + * system. As explained by \ref events , this function waits until there + * are \c Coal::Event objects to process and handles them. + */ +void *worker(void *data); + +#endif diff --git a/src/core/deviceinterface.h b/src/core/deviceinterface.h new file mode 100644 index 0000000..a321a9e --- /dev/null +++ b/src/core/deviceinterface.h @@ -0,0 +1,352 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED.
IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file deviceinterface.h + * \brief Abstraction layer between Clover core and the devices + */ + +#ifndef __DEVICEINTERFACE_H__ +#define __DEVICEINTERFACE_H__ + +#include <CL/cl.h> +#include <string> +#include "object.h" + +/* This pulls in legacy::PassManager when LLVM >= 3.4 */ +#include <llvm/PassManager.h> + +namespace Coal +{ + +class DeviceBuffer; +class DeviceProgram; +class DeviceKernel; + +class MemObject; +class Event; +class Program; +class Kernel; + +/** + * \brief Abstraction layer between core Clover objects and the devices + * + * This interface is used by the core Clover classes to communicate with the + * devices, which must reimplement all the functions described here. + */ +class DeviceInterface : public Object +{ + public: + DeviceInterface() : Object(Object::T_Device, 0) {} + virtual ~DeviceInterface() {} + + /** + * \brief Retrieve information about the device + * + * This function is used to retrieve information about an object. + * Sometimes, the size of the data retrieved is unknown (for example, a + * string). The application can call this function twice, the first time + * to get the size, then it allocates a buffer, and finally gets the data.
+ * + * \code + * const char *string = 0; + * size_t len; + * + * object->info(FOO_PROPERTY_STRING, 0, 0, &len); + * string = std::malloc(len); + * object->info(FOO_PROPERTY_STRING, len, string, 0); + * \endcode + * + * \param param_name Name of the property to retrieve + * \param param_value_size Size of the application-allocated buffer + * in which to put the value. + * \param param_value Pointer to an application-allocated buffer + * where the property data will be stored. Ignored + * if NULL. + * \param param_value_size_ret Size of the value retrieved, ignored if + * NULL. + * \return CL_SUCCESS in case of success, otherwise a CL error code. + */ + virtual cl_int info(cl_device_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) const = 0; + + /** + * \brief Create a \c Coal::DeviceBuffer object for this device + * \param buffer Memory object for which the buffer has to be created + * \param rs Error code (\c CL_SUCCESS if no error) + * \return a \c Coal::DeviceBuffer object, undefined if there is an error + */ + virtual DeviceBuffer *createDeviceBuffer(MemObject *buffer, cl_int *rs) = 0; + + /** + * \brief Create a \c Coal::DeviceProgram object for this device + * \param program \c Coal::Program containing the device-independent + * program data + * \return a \c Coal::DeviceProgram object + */ + virtual DeviceProgram *createDeviceProgram(Program *program) = 0; + + /** + * \brief Create a \c Coal::DeviceKernel object for this device + * \param kernel \c Coal::Kernel containing the device-independent kernel + * data + * \param function device-specific \c llvm::Function to be used + * \return a \c Coal::DeviceKernel object + */ + virtual DeviceKernel *createDeviceKernel(Kernel *kernel, + llvm::Function *function) = 0; + + /** + * \brief Push an event on the device + * \sa the end of \ref events + * \param event the event to be pushed + */ + virtual void pushEvent(Event *event) = 0; + + /** + * \brief Initialize
device-specific event data + * + * This call allows a device to initialize device-specific event data, + * by using \c Coal::Event::setDeviceData(). For instance, a + * hardware-accelerated device can associate a device command to an + * event, and use it to manage the event when it gets pushed. + * + * @note This function has one obligation: it must call + * \c Coal::MapBufferEvent::setPtr() and + * \c Coal::MapImageEvent::setPtr() (and other functions described + * in its documentation) + * + * \param event the event for which data can be set + * \return CL_SUCCESS in case of success + */ + virtual cl_int initEventDeviceData(Event *event) = 0; + + /** + * \brief Free device-specific event data + * + * This function is called just before \p event gets deleted. It allows + * a device to free device-specific data of this event, if any. + * + * \param event the event that will be destroyed + */ + virtual void freeEventDeviceData(Event *event) = 0; + /** \brief NOTE(review): undocumented upstream; presumably returns the text of the device's OpenCL C built-ins header -- confirm against the implementers. */ + virtual std::string builtinsHeader(void) const = 0; + /** \brief Device-specific initialisation hook. NOTE(review): when it is invoked is not visible in this header -- confirm. */ + virtual void init() = 0; + + /** + * \brief Ask device if it has enough work in its queue + * + * The default implementation always answers false. + */ + virtual bool gotEnoughToWorkOn() { return false; } +}; + +/** + * \brief Device-specific memory buffer + * + * This class is the backing-store used on a device for a \c Coal::MemObject. It + * is created by \c Coal::DeviceInterface::createDeviceBuffer().
+ */ +class DeviceBuffer +{ + public: + DeviceBuffer() {} + virtual ~DeviceBuffer() {} + + /** + * \brief Allocate the buffer on the device + * \return true when success, false otherwise + */ + virtual bool allocate() = 0; + + /** + * \brief \c Coal::DeviceInterface of this buffer + * \return parent \c Coal::DeviceInterface + */ + virtual DeviceInterface *device() const = 0; + + /** + * \brief Allocation status + * \return true if already allocated, false otherwise + */ + virtual bool allocated() const = 0; + + /** + * \brief Host-accessible memory pointer + * + * This function returns what is passed as arguments to native kernels + * (\c clEnqueueNativeKernel(), \c Coal::NativeKernelEvent) in place of + * \c Coal::MemObject pointers. + * + * For \c Coal::CPUDevice, it's simply a pointer in RAM, but + * hardware-accelerated devices may need to do some copying or mapping. + * + * \warning Beware that this data may get written to by the native kernel. + * + * \return A memory pointer usable by a host native kernel + */ + virtual void *nativeGlobalPointer() const = 0; +}; + +/** + * \brief Device-specific program data + */ +class DeviceProgram +{ + public: + DeviceProgram() {} + virtual ~DeviceProgram() {} + + /** + * \brief Linking or not \b stdlib with this program + * + * \b stdlib is a LLVM bitcode file containing some implementations of + * OpenCL C built-ins. This function allows a device to tell + * \c Coal::Program::build() if it wants \b stdlib to be linked or not. + * + * Linking the library may allow inlining of functions like \c ceil(), + * \c floor(), \c clamp(), etc. So, if these functions are not better + * handled by the device itself than by \b stdlib, it's a good thing + * to link it. + * + * But if the device provides instructions for these functions, then + * it could be better not to link \b stdlib and to replace the LLVM + * calls to these functions with device-specific instructions.
+ * + * \warning \b Stdlib currently only works for \c Coal::CPUDevice, as + * it contains host-specific code (LLVM IR is not meant to be + * portable, pointer size changes for example). + * + * \return true if \b stdlib must be linked with the program + */ + virtual bool linkStdLib() const = 0; + + /** + * \brief Create device-specific optimization passes + * + * This hook allows a device to add LLVM optimization passes to a + * \c llvm::PassManager . This way, devices needing function flattening + * or special analysis passes can have them run on the module. + * + * \param manager \c llvm::PassManager to which add the passes + * \param optimize false if \c -cl-opt-disable was given at compilation + * time. + * \param hasBarrier NOTE(review): undocumented upstream; presumably true + * when the program contains barrier calls -- confirm. + * Defaults to false. + */ + virtual void createOptimizationPasses(llvm::PassManager *manager, + bool optimize, bool hasBarrier=false) = 0; + + /** + * \brief Build a device-specific representation of the program + * + * This function is called by \c Coal::Program::build() when the module + * is compiled and linked. It can be used by the device to build a + * device-specific representation of the program. + * + * \param module \c llvm::Module containing the program's LLVM IR + * \param binary_str \c std::string containing dep.unlinked_binary + * \return true in case of success, false otherwise + */ + virtual bool build(llvm::Module *module, std::string* binary_str) = 0; + + /** + * \brief Extract binaries from MIXED binary + * + * This function is called to extract the LLVM bitcode and the native + * binary from the MIXED binary. The default implementation extracts + * nothing and returns false.
+ * \param binary_str \c std::string containing mixed binary + * \param bitcode \c std::string returns LLVM bitcode if not NULL + * \param native \c std::string returns native binary if not NULL + * \return true if the binary is indeed mixed + */ + virtual bool ExtractMixedBinary(std::string *binary_str, + std::string *bitcode, std::string *native) + { return false; } +}; + +/** + * \brief Device-specific kernel data + */ +class DeviceKernel +{ + public: + DeviceKernel() {} + virtual ~DeviceKernel() {} + + /** + * \brief Maximum work-group size of a kernel + * \return Maximum work-group size of the kernel based on device-specific + * data (such as memory usage, register pressure, etc.) + */ + virtual size_t workGroupSize() = 0; + + /** + * \brief Local memory used by the kernel + * \return Local memory used by the kernel, in bytes + */ + virtual cl_ulong localMemSize() const = 0; + + /** + * \brief Private memory used by the kernel + * \return Private memory used by the kernel, in bytes + */ + virtual cl_ulong privateMemSize() const = 0; + + /** + * \brief Preferred work-group size multiple + * \return The size multiple a work-group can have to work the best and + * the fastest on the device + */ + virtual size_t preferredWorkGroupSizeMultiple() const = 0; + + /** + * \brief Optimal work-group size + * + * This function allows a device to calculate the optimal work-group size + * for this kernel, using its memory usage, SIMD dimension, etc. + * + * \c Coal::CPUDevice tries to split the kernel into a number of + * work-groups the closest possible to the number of CPU cores. + * + * \param num_dims Number of working dimensions + * \param dim Dimension for which the multiple is being calculated + * \param global_work_size Total number of work-items to split into + * work-groups + * \return optimal size of a work-group, for the \p dim dimension.
+ */ + virtual size_t guessWorkGroupSize(cl_uint num_dims, cl_uint dim, + size_t global_work_size) const = 0; +}; + +} + +struct _cl_device_id : public Coal::DeviceInterface +{}; + +#endif diff --git a/src/core/dsp/buffer.cpp b/src/core/dsp/buffer.cpp new file mode 100644 index 0000000..72c5419 --- /dev/null +++ b/src/core/dsp/buffer.cpp @@ -0,0 +1,149 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "buffer.h" +#include "device.h" +#include "driver.h" + +#include "CL/cl_ext.h" +#include "../memobject.h" + +#include <cstdlib> +#include <cstring> +#include <iostream> + +using namespace Coal; + +DSPBuffer::DSPBuffer(DSPDevice *device, MemObject *buffer, cl_int *rs) + : DeviceBuffer(), p_device(device), p_buffer(buffer), p_data(0), + p_data_malloced(false), p_buffer_idx(0) +{ + if (buffer->type() != MemObject::SubBuffer && + buffer->flags() & CL_MEM_USE_HOST_PTR) + { + /*--------------------------------------------------------------------- + * We use the host ptr, we are already allocated + *--------------------------------------------------------------------*/ + p_data = (DSPDevicePtr64)(uint64_t)buffer->host_ptr(); + } +} + +DSPBuffer::~DSPBuffer() +{ + if (p_data_malloced) + { + if (p_buffer->flags() & CL_MEM_USE_MSMC_TI) + p_device->free_msmc (p_data); + else p_device->free_global(p_data); + } +} + +DSPDevicePtr64 DSPBuffer::data() const +{ + if (!p_data && p_buffer->type() == MemObject::SubBuffer) + { + /*--------------------------------------------------------------------- + * Data is based on the DSPBuffer of the parent buffer + *--------------------------------------------------------------------*/ + SubBuffer *subbuf = (SubBuffer *)p_buffer; + MemObject *parent = subbuf->parent(); + DSPBuffer *parent_dspbuf = (DSPBuffer 
*)parent->deviceBuffer(p_device); + + if (!parent_dspbuf->data()) parent_dspbuf->allocate(); + if (!parent_dspbuf->data()) { return 0; } //ERROR() + + return parent_dspbuf->data() + subbuf->offset(); + } + else if (!p_data) ; // ERROR(); + + return p_data; +} + +void *DSPBuffer::nativeGlobalPointer() const +{ + return (void*) (uint64_t) data(); +} + +bool DSPBuffer::allocate() +{ + size_t buf_size = p_buffer->size(); + + /*------------------------------------------------------------------------- + * Something went wrong... + *------------------------------------------------------------------------*/ + if (buf_size == 0) return false; + + if (!p_data && p_buffer->type() == MemObject::SubBuffer) + { + /*--------------------------------------------------------------------- + * Data is based on the DSPBuffer of the parent buffer + *--------------------------------------------------------------------*/ + SubBuffer *subbuf = (SubBuffer *)p_buffer; + MemObject *parent = subbuf->parent(); + DSPBuffer *parent_dspbuf = (DSPBuffer *)parent->deviceBuffer(p_device); + + if (!parent_dspbuf->data()) parent_dspbuf->allocate(); + if (!parent_dspbuf->data()) return false; + + p_data = parent_dspbuf->data() + subbuf->offset(); + return true; + } + + /*------------------------------------------------------------------------- + * We not using a host ptr, allocate a buffer + *------------------------------------------------------------------------*/ + if (!p_data) + { + if (p_buffer->flags() & CL_MEM_USE_MSMC_TI) + p_data = (DSPDevicePtr64) p_device->malloc_msmc(buf_size); + else p_data = (DSPDevicePtr64) p_device->malloc_global(buf_size, false); + + if (!p_data) return false; + + p_data_malloced = true; + } + + if (p_buffer->type() != MemObject::SubBuffer && + p_buffer->flags() & CL_MEM_COPY_HOST_PTR) + Driver::instance()->write(p_device->dspID(), p_data, + (uint8_t*)p_buffer->host_ptr(), buf_size); + + // Say to the memobject that we are allocated + p_buffer->deviceAllocated(this); + 
+ return true; +} + +DeviceInterface *DSPBuffer::device() const +{ + return p_device; +} + +bool DSPBuffer::allocated() const +{ + return p_data != 0; +} diff --git a/src/core/dsp/buffer.h b/src/core/dsp/buffer.h new file mode 100644 index 0000000..b8cb860 --- /dev/null +++ b/src/core/dsp/buffer.h @@ -0,0 +1,61 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#ifndef __DSP_BUFFER_H__ +#define __DSP_BUFFER_H__ + +#include "../deviceinterface.h" +#include "device.h" + +namespace Coal +{ + +class DSPDevice; +class MemObject; + /** \brief Coal::DeviceBuffer implementation for the DSP device: records the device address that backs one Coal::MemObject. */ +class DSPBuffer : public DeviceBuffer +{ + public: /* rs: status out-parameter; NOTE(review): the constructor in buffer.cpp does not write it -- confirm callers pre-initialise it. */ + DSPBuffer(DSPDevice *device, MemObject *buffer, cl_int *rs); /* The destructor frees p_data only when p_data_malloced is set. */ + ~DSPBuffer(); + /* allocate(): obtains a data address if there is none yet; returns false for a zero-sized MemObject or on allocation failure. */ + bool allocate(); + DeviceInterface *device() const; /* data(): device address of the contents, 0 if none; for a sub-buffer it is derived from the parent buffer. */ + DSPDevicePtr64 data() const ; /* nativeGlobalPointer(): data() cast to a host pointer. */ + void *nativeGlobalPointer() const ; /* allocated(): true when p_data is non-zero. */ + bool allocated() const; + + private: /* p_device: device this buffer was created for; returned by device(). */ + DSPDevice * p_device; /* p_buffer: the MemObject this buffer backs. */ + MemObject * p_buffer; /* p_data: data address; 0 until a host pointer is adopted or allocate() succeeds. */ + DSPDevicePtr64 p_data; /* p_data_malloced: true when allocate() took p_data from the device allocator. */ + bool p_data_malloced; /* p_buffer_idx: initialised to 0 by the constructor; not otherwise used in buffer.cpp. */ + unsigned int p_buffer_idx; +}; +} +#endif diff --git a/src/core/dsp/cmem.cpp b/src/core/dsp/cmem.cpp new file mode 100644 index 0000000..ee0f938 --- /dev/null +++ b/src/core/dsp/cmem.cpp @@ -0,0 +1,271 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "cmem.h" +#include <deque> +#include <iostream> +#include <cstring> +#include <cstdio> +#include <cstdlib> +#include <sys/stat.h> +#include <string> + +#define ERR(status, msg) if (status) { printf("ERROR: %s\n", msg); exit(-1); } + +Cmem* Cmem::pInstance = 0; + +/*============================================================================= +* C M E M +*============================================================================*/ +#define CEIL_DIVIDE(x,y) (((x) + (y) - 1) / y) + +/****************************************************************************** +* Thread safe instance function for singleton behavior +******************************************************************************/ +Cmem* Cmem::instance() +{ + static Mutex Cmem_instance_mutex; + Cmem* tmp = pInstance; + + __sync_synchronize(); + + if (tmp == 0) + { + ScopedLock lck(Cmem_instance_mutex); + + tmp = pInstance; + if (tmp == 0) + { + tmp = new Cmem; + __sync_synchronize(); + pInstance = tmp; + } + } + return tmp; +} + +/****************************************************************************** +* Cmem::open() +******************************************************************************/ +void Cmem::open() +{ + int status = cmem_drv_open(); + ERR(status, "DMA Contiguous Memory Driver Open Error"); + + status = cmem_drv_free(0, HOST_BUF_TYPE_DYNAMIC, buf_desc); + ERR(status, "DMA Contiguous Memory Free Error"); + + status = cmem_drv_alloc(MAX_NUM_HOST_DSP_BUFFERS, HOST_CMEM_BUFFER_SIZE, + HOST_BUF_TYPE_DYNAMIC, buf_desc); + ERR(status, "DMA Contiguous Memory Alloc Error"); + + status = bufmgrCreate(&DmaBufPool, MAX_NUM_HOST_DSP_BUFFERS, buf_desc); + ERR(status, "DMA Buffer manager Create Error"); +} + +/****************************************************************************** +* Cmem::close() +******************************************************************************/ +void Cmem::close() 
+{ + bufmgrDelete(&DmaBufPool); + + int status = cmem_drv_free(MAX_NUM_HOST_DSP_BUFFERS, HOST_BUF_TYPE_DYNAMIC, + buf_desc); + ERR(status, "DMA Contiguous Memory Driver Free Error"); + + status = cmem_drv_close(); + ERR(status, "DMA Contiguous Memory Driver Close Error"); +} + + +/****************************************************************************** +* The dma to the dsp memory system can only occur from contiguous memory, i.e. +* cmem. CMEM buffers are currently limited to 4M, the algorithm is to +* copy the general buffer in 4M chunks into CMEM 4M buffers. Then we are able +* to chain 2 4M buffer writes per DMA initiate. As a result, we will have +* ceil ( size / 8M ) dma transfers initiated by the routine. to make it +* concrete at 48M buffer dma, will result in: +* 12 memcpy calls of 4M each, +* 12 CMEM buffers allocated of 4M each +* 6 dma_initiates each with 2 - 4M buffers +* +* The algorithm is based one the MAX_CONTIGUOUS_XFER_BUFFERS and +* HOST_CMEM_BUFFER_SIZE macros. Currently they are 2 and 4M. 
+******************************************************************************/ +void Cmem::dma_write(int32_t dsp_id, uint32_t addr, uint8_t *buf, uint32_t size) +{ + static uint32_t trans_id = 0; + uint32_t start_trans_id = trans_id; + int32_t ret_val; + std::deque<uint32_t> dma_ids; + + uint32_t simul_dmas = 4; + uint32_t cmem_buffer_size = HOST_CMEM_BUFFER_SIZE; + uint32_t tot_buffers = CEIL_DIVIDE(size, cmem_buffer_size); + uint32_t circ_buffers = std::min(simul_dmas, tot_buffers); + uint32_t last_buffer_size = size - ((tot_buffers-1) * cmem_buffer_size); + + cmem_host_buf_desc_t *host_buf_desc = + new cmem_host_buf_desc_t[circ_buffers]; + + cmem_host_frame_desc_t *host_frame_desc = + new cmem_host_frame_desc_t[circ_buffers]; + + /*--------------------------------------------------------------------- + * Allocate Host CMEM buffers + *--------------------------------------------------------------------*/ + for (int i = 0; i < circ_buffers; i++) + { + ret_val = bufmgrAlloc(DmaBufPool, 1, &host_buf_desc[i]); + ERR(ret_val, "dma buffer allocation failed"); + host_frame_desc[i].bufDescP = &host_buf_desc[i]; + host_frame_desc[i].numBuffers = 1; + host_frame_desc[i].frameStartOffset = 0; + host_frame_desc[i].frameSize = cmem_buffer_size; + } + + /*------------------------------------------------------------------------- + * Initiate one transfer at a time based on what fits within the allowed + * contiguous buffers per DMA transaction + *------------------------------------------------------------------------*/ + for (int i = 0; i < tot_buffers; ++i) + { + int circ_i = i % simul_dmas; + int offset = i * cmem_buffer_size; + + cmem_host_buf_desc_t &buf_desc = host_buf_desc[circ_i]; + uint32_t cpy_size = buf_desc.length; + + if (i == tot_buffers-1) + host_frame_desc[circ_i].frameSize = cpy_size = last_buffer_size; + + memcpy(buf_desc.userAddr, buf + offset, cpy_size); + + /*--------------------------------------------------------------------- + * Initiate DMA + 
*--------------------------------------------------------------------*/ + ret_val = pciedrv_dma_write_initiate(dsp_id, addr + offset, + &host_frame_desc[circ_i], + PCIEDRV_DMA_XFER_NON_BLOCKING, + &trans_id); + ERR(ret_val, "DMA initiate failed"); + + dma_ids.push_back(trans_id); + + if (dma_ids.size() >= simul_dmas) + { + while (pciedrv_dma_check(dsp_id, dma_ids.front())); + dma_ids.pop_front(); + } + } + + /*--------------------------------------------------------------------- + * Wait for all dmas to complete + *--------------------------------------------------------------------*/ + for (int i = 0; i < dma_ids.size(); i++) + while (pciedrv_dma_check(dsp_id, dma_ids[i])); + + /*--------------------------------------------------------------------- + * Free host CMEM buffers + *--------------------------------------------------------------------*/ + for (int i = 0; i < circ_buffers; i++) + { + ret_val = bufmgrFreeDesc(DmaBufPool, &host_buf_desc[i]); + ERR(ret_val, "dma buffer free failed"); + } + + delete [] host_buf_desc; + delete [] host_frame_desc; +} + +/****************************************************************************** +* Cmem::dma_read +******************************************************************************/ +void Cmem::dma_read(int32_t dsp_id, uint32_t addr, uint8_t *buf, uint32_t size) +{ + cmem_host_buf_desc_t host_buf_desc; + cmem_host_frame_desc_t host_frame_desc; + + /*------------------------------------------------------------------------- + * Calculate total number of host buffers required to fit the data + *------------------------------------------------------------------------*/ + uint32_t num_buffers = CEIL_DIVIDE(size, HOST_CMEM_BUFFER_SIZE); + uint32_t remaining_size = size; + uint32_t offset = 0; + uint32_t transfer_size = HOST_CMEM_BUFFER_SIZE; + uint32_t trans_id; + int32_t ret_val; + + /*--------------------------------------------------------------------- + * Allocate Host buffer + 
*--------------------------------------------------------------------*/ + ret_val = bufmgrAlloc(DmaBufPool, 1, &host_buf_desc); + ERR(ret_val, "dma buffer allocation failed"); + + /*--------------------------------------------------------------------- + * Populate details of data in frame descriptor + *--------------------------------------------------------------------*/ + host_frame_desc.bufDescP = &host_buf_desc; + host_frame_desc.numBuffers = 1; + host_frame_desc.frameStartOffset = 0; + host_frame_desc.frameSize = transfer_size; + + /*------------------------------------------------------------------------- + * Initiate one transfer at a time based on what fits within the allowed + *------------------------------------------------------------------------*/ + while (num_buffers) + { + if (num_buffers == 1) + { + transfer_size = remaining_size; + host_frame_desc.frameSize = transfer_size; + } + + /*--------------------------------------------------------------------- + * Initiate DMA + *--------------------------------------------------------------------*/ + ret_val = pciedrv_dma_read_initiate(dsp_id, addr + offset, + &host_frame_desc, PCIEDRV_DMA_XFER_BLOCKING, &trans_id); + ERR(ret_val, "DMA initiate failed"); + + /*--------------------------------------------------------------------- + * Copy from dma buffers into buffer + *--------------------------------------------------------------------*/ + memcpy (buf + offset, host_buf_desc.userAddr, transfer_size); + + num_buffers--; + offset += transfer_size; + remaining_size -= transfer_size; + } + + /*--------------------------------------------------------------------- + * Free Buffer Descriptors + *--------------------------------------------------------------------*/ + ret_val = bufmgrFreeDesc(DmaBufPool, &host_buf_desc); + ERR(ret_val, "dma buffer free failed"); +} diff --git a/src/core/dsp/cmem.h b/src/core/dsp/cmem.h new file mode 100644 index 0000000..24a6de0 --- /dev/null +++ b/src/core/dsp/cmem.h @@ -0,0 
+1,64 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#ifndef _CMEM_H +#define _CMEM_H +#include "u_lockable.h" + +extern "C" +{ + #include "pciedrv.h" + #include "cmem_drv.h" + #include "bufmgr.h" +} + +#define HOST_CMEM_BUFFER_SIZE 0x400000 // 4M +#define MAX_NUM_HOST_DSP_BUFFERS 128 + +class Cmem : public Lockable_off +{ + public: + ~Cmem() { close(); } + static Cmem* instance (); + + void open(); + void close(); + void dma_write(int32_t dsp_id, uint32_t addr, uint8_t *buf, uint32_t size); + void dma_read (int32_t dsp_id, uint32_t addr, uint8_t *buf, uint32_t size); + + private: + static Cmem* pInstance; + + cmem_host_buf_desc_t buf_desc[MAX_NUM_HOST_DSP_BUFFERS]; + void * DmaBufPool; + + Cmem() : DmaBufPool(NULL) { open(); } + Cmem(const Cmem&); // copy ctor disallowed + Cmem& operator=(const Cmem&); // assignment disallowed +}; + +#endif // _CMEM_H diff --git a/src/core/dsp/core_scheduler.h b/src/core/dsp/core_scheduler.h new file mode 100644 index 0000000..58d0555 --- /dev/null +++ b/src/core/dsp/core_scheduler.h @@ -0,0 +1,62 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "u_lockable.h" +#ifndef _CORE_SCHEDULER_H +#define _CORE_SCHEDULER_H + +class CoreScheduler : public Lockable +{ + public: + CoreScheduler() : p_avail(0xff) {} + + void free(int core) + { + Lock lock(this); + p_avail |= (1 << core); + CV.notify_one(); + } + + int allocate() + { + Lock lock(this); + + /*--------------------------------------------------------------------- + * Wait in a loop in case the condvar is falsely signalled + *--------------------------------------------------------------------*/ + while (!p_avail) CV.wait(lock.raw()); + + for (int i=0, mask = 1; i < 8; ++i, mask <<= 1) + if (p_avail & mask) { p_avail &= ~mask; return i; } + } + + private: + unsigned char p_avail; + CondVar CV; +}; + +#endif //_CORE_SCHEDULER_H diff --git a/src/core/dsp/database.h b/src/core/dsp/database.h new file mode 100644 index 0000000..ca4d69e --- /dev/null +++ b/src/core/dsp/database.h @@ -0,0 +1,112 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#ifndef __DATABASE_H__ +#define __DATABASE_H__ + +#include <string> +#include <vector> +#include <iostream> +#include <sqlite3.h> + +using namespace std; + +class Database +{ + public: + Database(const char* filename) : database(NULL) { open(filename); } + ~Database() { close(); } + + void close() + { + if (database) sqlite3_close(database); + database = NULL; + } + + vector<vector<string> > query(const char* query) + { + sqlite3_stmt *statement; + vector<vector<string> > results; + const int retry_limit = 20; + int retries = 0; + + int rc = sqlite3_prepare_v2(database, query, -1, &statement, 0); + + while ((rc == SQLITE_BUSY || rc == SQLITE_LOCKED) && + ++retries <= retry_limit) + { + sqlite3_finalize(statement); + usleep(100); + rc = sqlite3_prepare_v2(database, query, -1, &statement, 0); + } + + if (rc == SQLITE_OK) + { + int cols = sqlite3_column_count(statement); + int result = 0; + + while (true) + { + result = sqlite3_step(statement); + + if (result == SQLITE_ROW) + { + vector<string> values; + for (int col = 0; col < cols; col++) + values.push_back((char*)sqlite3_column_text(statement,col)); + results.push_back(values); + } + else break; + } + + sqlite3_finalize(statement); + } + + string error = sqlite3_errmsg(database); + if (error != "not an error") + std::cout << query << " " << error << std::endl; + + return results; + } + + private: + sqlite3 *database; + + private: + bool open(const char* filename) + { + if (sqlite3_open(filename, &database) == SQLITE_OK) + { + sqlite3_busy_timeout(database, 1000); + return true; + } + return false; + } + +}; + +#endif diff --git a/src/core/dsp/device.cpp b/src/core/dsp/device.cpp new file mode 100644 index 0000000..32cd9b0 --- /dev/null +++ b/src/core/dsp/device.cpp @@ -0,0 +1,1135 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - 
http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "../platform.h" +#include "device.h" +#include "buffer.h" +#include "kernel.h" +#include "program.h" +#include <cstdlib> +#include <algorithm> +#include <limits.h> +#include "CL/cl_ext.h" + +#include <core/config.h> +#include "../propertylist.h" +#include "../commandqueue.h" +#include "../events.h" +#include "../memobject.h" +#include "../kernel.h" +#include "../program.h" +#include "../util.h" + +#include "driver.h" +#include "mailbox.h" + +extern "C" +{ + #include "dload_api.h" + #include <ti/runtime/mmap/include/mmap_resource.h> + +} + +#include <cstring> +#include <cstdlib> +#include <unistd.h> + +#include <algorithm> +#include <iostream> +#include <fstream> +#include <sstream> + +using namespace Coal; + +Mailbox* Mailbox::pInstance = 0; + +/****************************************************************************** +* On DSPC868X the mailboxes are remote on the device DDR. On Hawking the +* mailboxes are in shared DDR +******************************************************************************/ +#ifdef DSPC868X +#define MAILBOX_LOCATION MPM_MAILBOX_MEMORY_LOCATION_REMOTE +#else +#define MAILBOX_LOCATION MPM_MAILBOX_MEMORY_LOCATION_LOCAL + +#include "shmem.h"
/******************************************************************************
* dsp_speed
*
* Derive the DSP clock in MHz from three memory-mapped registers: the
* reference frequency DSP_PLL is multiplied by `mult` and divided by `prediv`
* and `output_div`, all decoded from the register values read below.
******************************************************************************/
unsigned dsp_speed()
{
    const unsigned DSP_PLL  = 122880000;
    const unsigned pagesize = 0x1000;

    shmem_persistent bootcfg_page;
    shmem_persistent clock_page;

    bootcfg_page.configure(0x02620000, pagesize);
    clock_page.configure(0x02310000, pagesize);

    char *BOOTCFG_BASE_ADDR = (char*)bootcfg_page.map(0x02620000, pagesize);
    char *CLOCK_BASE_ADDR   = (char*)clock_page.map(0x02310000, pagesize);

    int MAINPLLCTL0 = (*(int*)(BOOTCFG_BASE_ADDR + 0x350));
    int MULT        = (*(int*)(CLOCK_BASE_ADDR + 0x110));
    int OUTDIV      = (*(int*)(CLOCK_BASE_ADDR + 0x108));

    unsigned mult       = 1 + ((MULT & 0x3F) | ((MAINPLLCTL0 & 0x7F000) >> 6));
    unsigned prediv     = 1 + (MAINPLLCTL0 & 0x3F);
    unsigned output_div = 1 + ((OUTDIV >> 19) & 0xF);

    /*-------------------------------------------------------------------------
    * DSP_PLL * mult exceeds 32 bits as soon as mult >= 35, so the product is
    * formed in 64 bits before dividing. Doing it in `unsigned` silently wraps
    * and reports a wrong frequency.
    *------------------------------------------------------------------------*/
    unsigned long long hz =
        (unsigned long long)DSP_PLL * mult / prediv / output_div;

    bootcfg_page.unmap(BOOTCFG_BASE_ADDR, pagesize);
    clock_page.unmap(CLOCK_BASE_ADDR, pagesize);

    return (unsigned)(hz / 1000000);
}
#endif
+/*----------------------------------------------------------------------------- +* Declare our threaded dsp handler function +*----------------------------------------------------------------------------*/ +void *dsp_worker(void* data); +void HOSTwait (unsigned char dsp_id); + +/****************************************************************************** +* DSPDevice::DSPDevice(unsigned char dsp_id) +******************************************************************************/ +DSPDevice::DSPDevice(unsigned char dsp_id) + : DeviceInterface (), + p_cores (8), + p_num_events (0), + p_dsp_mhz (1000), // 1.00 GHz + p_worker (0), + p_rx_mbox (0), + p_tx_mbox (0), + p_stop (false), + p_initialized (false), + p_dsp_id (dsp_id), + p_device_msmc_heap(), + p_device_ddr_heap1(), + p_device_ddr_heap2(), + p_device_ddr_heap3(), + p_device_l2_heap (), + p_dload_handle (0), + p_complete_pending(), + p_mpax_default_res(NULL) +{ + Driver *driver = Driver::instance(); + + void *hdl = driver->reset_and_load(dsp_id); + + p_addr_kernel_config = driver->get_symbol(hdl, "kernel_config_l2"); + p_addr_local_mem = driver->get_symbol(hdl, "ocl_local_mem_start"); + p_addr_mbox_d2h_phys = driver->get_symbol(hdl, "mbox_d2h_phys"); + p_addr_mbox_h2d_phys = driver->get_symbol(hdl, "mbox_h2d_phys"); + p_size_local_mem = driver->get_symbol(hdl, "ocl_local_mem_size"); + p_size_mbox_d2h = driver->get_symbol(hdl, "mbox_d2h_size"); + p_size_mbox_h2d = driver->get_symbol(hdl, "mbox_h2d_size"); + + /*------------------------------------------------------------------------- + * These 4 variables were previously retrieved from the monitor out file. + * They are now determined by query of the CMEM system.
+ *------------------------------------------------------------------------*/ + //p_addr_global_mem = driver->get_symbol(hdl, "ocl_global_mem_start"); + //p_addr_msmc_mem = driver->get_symbol(hdl, "ocl_msmc_mem_start"); + //p_size_global_mem = driver->get_symbol(hdl, "ocl_global_mem_size"); + //p_size_msmc_mem = driver->get_symbol(hdl, "ocl_msmc_mem_size"); + +#if 0 + // Adjust p_size_global_mem for PG1.0 board, monitor takes 2MB + #define MONITOR_MEM 2 + uint32_t mem_reserve = parse_file_line_value("/proc/cmdline", + "mem_reserve=", 0); + if (mem_reserve > 0 && mem_reserve*1024*1024 < p_size_global_mem) + p_size_global_mem = (mem_reserve - MONITOR_MEM) * 1024 * 1024; + + char *dsp_global_mem_size = getenv("TI_OCL_DSP_GLOBAL_MEM_SIZE"); + if (dsp_global_mem_size) + p_size_global_mem = atol(dsp_global_mem_size); + + // Ordering is important: global in CMEM block 0, msmc in CMEM block 1 + driver->cmem_init(p_addr_global_mem, p_size_global_mem, + p_addr_msmc_mem, p_size_msmc_mem); +#endif + p_addr64_global_mem = 0; + p_size64_global_mem = 0; + p_addr_msmc_mem = 0; + p_size_msmc_mem = 0; + DSPDevicePtr64 global3 = 0; + uint64_t gsize3 = 0; + driver->cmem_init(&p_addr64_global_mem, &p_size64_global_mem, + &p_addr_msmc_mem, &p_size_msmc_mem, + &global3, &gsize3); + + DSPDevicePtr64 global1 = p_addr64_global_mem; + DSPDevicePtr64 global2 = 0; + uint64_t gsize1 = p_size64_global_mem; + uint64_t gsize2 = 0; + driver->split_ddr_memory(p_addr64_global_mem, p_size64_global_mem, + global1, gsize1, global2, gsize2, gsize3); + + driver->shmem_configure(global1, gsize1, 0); + if (gsize2 > 0) driver->shmem_configure(global2, gsize2, 0); + if (gsize3 > 0) driver->shmem_configure(global3, gsize3, 0); + driver->shmem_configure(p_addr_msmc_mem, p_size_msmc_mem, 1); + driver->shmem_configure(p_addr_mbox_d2h_phys, p_size_mbox_d2h); + driver->shmem_configure(p_addr_mbox_h2d_phys, p_size_mbox_h2d); + for (int core=0; core < 8; core++) + driver->shmem_configure(((0x10 + core) << 24) + 
p_addr_local_mem, + p_size_local_mem); + + driver->free_image_handle(hdl); + + /*------------------------------------------------------------------------- + * Setup the DSP heaps for memory allocation + *------------------------------------------------------------------------*/ + p_device_ddr_heap1.configure(global1, gsize1); + p_device_ddr_heap2.configure(global2, gsize2, true); + p_device_ddr_heap3.configure(global3, gsize3, true); + p_device_l2_heap.configure (p_addr_local_mem, p_size_local_mem); + p_device_msmc_heap.configure(p_addr_msmc_mem, p_size_msmc_mem); + + /*------------------------------------------------------------------------- + * initialize the mailboxes on the cores, so they can receive an exit cmd + *------------------------------------------------------------------------*/ + Mailbox* mb_instance = Mailbox::instance(); + + uint32_t mailboxallocsize = mpm_mailbox_get_alloc_size(); + + p_tx_mbox = (void*)malloc(mailboxallocsize); + p_rx_mbox = (void*)malloc(mailboxallocsize); + + mpm_mailbox_config_t mbConfig; + mbConfig.mem_start_addr = + (uint32_t)driver->map(p_addr_mbox_h2d_phys, p_size_mbox_h2d); + + mbConfig.mem_size = p_size_mbox_h2d; + mbConfig.max_payload_size = mbox_payload; + + int tx_status = mb_instance->create(p_tx_mbox, + NULL, + MAILBOX_LOCATION, + MPM_MAILBOX_DIRECTION_SEND, &mbConfig); + + mbConfig.mem_start_addr = + (uint32_t)driver->map(p_addr_mbox_d2h_phys, p_size_mbox_d2h); + mbConfig.mem_size = p_size_mbox_d2h; + + int rx_status = mb_instance->create(p_rx_mbox, + NULL, + MAILBOX_LOCATION, + MPM_MAILBOX_DIRECTION_RECEIVE, &mbConfig); + + tx_status |= mb_instance->open(p_tx_mbox); + rx_status |= mb_instance->open(p_rx_mbox); + + if (tx_status != 0 || rx_status != 0) + std::cout << "Could not create mailboxes for dsp " + << p_dsp_id << std::endl; + + +#ifdef DSPC868X + char *ghz1 = getenv("TI_OCL_DSP_1_25GHZ"); + if (ghz1) p_dsp_mhz = 1250; // 1.25 GHz +#else + mail_to(frequencyMsg); + + int ret = 0; + do + { + while 
(!mail_query()) ; + ret = mail_from(); + } while (ret == -1); + + p_dsp_mhz = ret; +#endif + +} + + +/****************************************************************************** +* void DSPDevice::init() +******************************************************************************/ +void DSPDevice::init() +{ + if (p_initialized) return; + + /*------------------------------------------------------------------------- + * Initialize the locking machinery and create worker threads + *------------------------------------------------------------------------*/ + pthread_cond_init(&p_events_cond, 0); + pthread_mutex_init(&p_events_mutex, 0); + pthread_create(&p_worker, 0, &dsp_worker, this); + + p_initialized = true; +} + +/****************************************************************************** +* DSPDevice::~DSPDevice() +******************************************************************************/ +DSPDevice::~DSPDevice() +{ + /*------------------------------------------------------------------------- + * Inform the cores on the device to stop listening for commands + *------------------------------------------------------------------------*/ + mail_to(exitMsg); + + free (p_tx_mbox); + free (p_rx_mbox); + + /*------------------------------------------------------------------------- + * Only need to close the driver for one of the devices + *------------------------------------------------------------------------*/ + if (p_dsp_id == 0) Driver::instance()->close(); + + if (!p_initialized) return; + + /*------------------------------------------------------------------------- + * Terminate the workers and wait for them + *------------------------------------------------------------------------*/ + pthread_mutex_lock(&p_events_mutex); + + p_stop = true; + + pthread_cond_broadcast(&p_events_cond); + pthread_mutex_unlock(&p_events_mutex); + + pthread_join(p_worker, 0); + + pthread_mutex_destroy(&p_events_mutex); + pthread_cond_destroy(&p_events_cond); +} + 
+/****************************************************************************** +* DeviceBuffer *DSPDevice::createDeviceBuffer(MemObject *buffer) +******************************************************************************/ +DeviceBuffer *DSPDevice::createDeviceBuffer(MemObject *buffer, cl_int *rs) + { return (DeviceBuffer *)new DSPBuffer(this, buffer, rs); } + +/****************************************************************************** +* DeviceProgram *DSPDevice::createDeviceProgram(Program *program) +******************************************************************************/ +DeviceProgram *DSPDevice::createDeviceProgram(Program *program) + { return (DeviceProgram *)new DSPProgram(this, program); } + +/****************************************************************************** +* DeviceKernel *DSPDevice::createDeviceKernel(Kernel *kernel, +******************************************************************************/ +DeviceKernel *DSPDevice::createDeviceKernel(Kernel *kernel, + llvm::Function *function) + { return (DeviceKernel *)new DSPKernel(this, kernel); } + +/****************************************************************************** +* cl_int DSPDevice::initEventDeviceData(Event *event) +******************************************************************************/ +cl_int DSPDevice::initEventDeviceData(Event *event) +{ + switch (event->type()) + { + case Event::MapBuffer: + { + MapBufferEvent *e = (MapBufferEvent*) event; + + if (e->buffer()->flags() & CL_MEM_USE_HOST_PTR) + { + e->setPtr((char*)e->buffer()->host_ptr() + e->offset()); + break; + } + + DSPBuffer *buf = (DSPBuffer*) e->buffer()->deviceBuffer(this); + DSPDevicePtr64 data = buf->data() + e->offset(); + + // DO NOT INVALIDATE! Here only initializes host_addr, it cannot + // be used before MapBuffer event is scheduled and processed! 
+ void* host_addr = Driver::instance()->map(data, e->cb(), false); + e->setPtr(host_addr); + break; + } + + case Event::MapImage: break; + + case Event::NDRangeKernel: + case Event::TaskKernel: + { + KernelEvent *e = (KernelEvent *)event; + Program *p = (Program *)e->kernel()->parent(); + DSPProgram *prog = (DSPProgram *)p->deviceDependentProgram(this); + + /*----------------------------------------------------------------- + * Just in time loading + *----------------------------------------------------------------*/ + if (!prog->is_loaded() && !prog->load()) + return CL_MEM_OBJECT_ALLOCATION_FAILURE; + + DSPKernel *dspkernel = (DSPKernel*)e->deviceKernel(); + + cl_int ret = dspkernel->preAllocBuffers(); + if (ret != CL_SUCCESS) return ret; + + // ASW TODO do something + + // Set device-specific data + DSPKernelEvent *dsp_e = new DSPKernelEvent(this, e); + e->setDeviceData((void *)dsp_e); + break; + } + default: break; + } + + return CL_SUCCESS; +} + +/****************************************************************************** +* void DSPDevice::freeEventDeviceData(Event *event) +******************************************************************************/ +void DSPDevice::freeEventDeviceData(Event *event) +{ + switch (event->type()) + { + case Event::NDRangeKernel: + case Event::TaskKernel: + { + DSPKernelEvent *dsp_e = (DSPKernelEvent *)event->deviceData(); + if (dsp_e) delete dsp_e; + } + default: break; + } +} + +/****************************************************************************** +* void DSPDevice::pushEvent(Event *event) +******************************************************************************/ +void DSPDevice::pushEvent(Event *event) +{ + /*------------------------------------------------------------------------- + * Add an event in the list + *------------------------------------------------------------------------*/ + pthread_mutex_lock(&p_events_mutex); + + p_events.push_back(event); + p_num_events++; // Way faster than STL 
list::size() ! + + pthread_cond_broadcast(&p_events_cond); + pthread_mutex_unlock(&p_events_mutex); +} + +bool DSPDevice::stop() { return p_stop; } +bool DSPDevice::availableEvent() { return p_num_events > 0; } + +/****************************************************************************** +* Event *DSPDevice::getEvent(bool &stop) +******************************************************************************/ +Event *DSPDevice::getEvent(bool &stop) +{ + /*------------------------------------------------------------------------- + * Return the first event in the list, if any. Remove it if it is a + * single-shot event. + *------------------------------------------------------------------------*/ + pthread_mutex_lock(&p_events_mutex); + + while (p_num_events == 0 && !p_stop) + pthread_cond_wait(&p_events_cond, &p_events_mutex); + + if (p_stop) + { + pthread_mutex_unlock(&p_events_mutex); + stop = true; + return 0; + } + + Event *event = p_events.front(); + p_num_events--; + p_events.pop_front(); + + pthread_mutex_unlock(&p_events_mutex); + + return event; +} + +void DSPDevice::push_complete_pending(uint32_t idx, Event* const data) + { p_complete_pending.push(idx, data); } + +bool DSPDevice::get_complete_pending(uint32_t idx, Event*& data) + { return p_complete_pending.try_pop(idx, data); } + +void DSPDevice::dump_complete_pending() { p_complete_pending.dump(); } + +bool DSPDevice::any_complete_pending() { return !p_complete_pending.empty(); } + +/****************************************************************************** +* Device's decision about whether CommandQueue should push more events over +* This number could be tuned (e.g. using ooo example). Note that p_num_events +* are in device's queue, but not yet executed. 
+******************************************************************************/ +bool DSPDevice::gotEnoughToWorkOn() { return p_num_events > 0; } + +/****************************************************************************** +* Getter functions +******************************************************************************/ +unsigned int DSPDevice::numDSPs() const { return p_cores; } +float DSPDevice::dspMhz() const { return p_dsp_mhz; } +unsigned char DSPDevice::dspID() const { return p_dsp_id; } +DLOAD_HANDLE DSPDevice::dload_handle() const { return p_dload_handle; } + + +int DSPDevice::load(const char *filename) +{ + if (!p_dload_handle) + { + p_dload_handle = DLOAD_create((void*)this); + DLOAD_initialize(p_dload_handle); + } + + FILE *fp = fopen(filename, "rb"); + if (!fp) { printf("can't open OpenCL Program file\n"); exit(1); } + + int prog_handle = DLOAD_load(p_dload_handle, fp); + fclose(fp); + return prog_handle; +} + +bool DSPDevice::unload(int file_handle) +{ + if (p_dload_handle) + return DLOAD_unload(p_dload_handle, file_handle); + return false; +} + +DSPDevicePtr DSPDevice::get_local_scratch(uint32_t &size, uint32_t &block_size) +{ + uint64_t size64; + DSPDevicePtr64 addr64 = p_device_l2_heap.max_block_size(size64, block_size); + size = (uint32_t) size64; + return (DSPDevicePtr) addr64; +} + +DSPDevicePtr DSPDevice::malloc_local(size_t size) + { return p_device_l2_heap.malloc(size,true); } + +void DSPDevice::free_local(DSPDevicePtr addr) + { p_device_l2_heap.free(addr); } + +DSPDevicePtr DSPDevice::malloc_msmc(size_t size) + { return p_device_msmc_heap.malloc(size,true); } + +void DSPDevice::free_msmc(DSPDevicePtr addr) + { p_device_msmc_heap.free(addr); } + +// TODO: examine the flag, the logic, etc +#define FRACTION_PERSISTENT_FOR_BUFFER 8 +DSPDevicePtr64 DSPDevice::malloc_global(size_t size, bool prefer_32bit) +{ + if (prefer_32bit) return p_device_ddr_heap1.malloc(size, true); + + DSPDevicePtr64 addr = 0; + uint64_t size64 = 0; + uint32_t 
block_size; + p_device_ddr_heap1.max_block_size(size64, block_size); + if (size64 / size > FRACTION_PERSISTENT_FOR_BUFFER) + addr = p_device_ddr_heap1.malloc(size, true); + if (!addr) + // addr = Driver::instance()->cmem_ondemand_malloc(size); + addr = p_device_ddr_heap2.malloc(size, true); + if (!addr) + addr = p_device_ddr_heap3.malloc(size, true); + if (!addr) + addr = p_device_ddr_heap1.malloc(size, true); // give it another chance + return addr; +} + +void DSPDevice::free_global(DSPDevicePtr64 addr) +{ + if (addr < DSP_36BIT_ADDR) + p_device_ddr_heap1.free(addr); + else + // Driver::instance()->cmem_ondemand_free(addr); + if (p_device_ddr_heap2.free(addr) == -1) + p_device_ddr_heap3.free(addr); +} + +void DSPDevice::mail_to(Msg_t &msg) +{ + static unsigned trans_id = 0xC0DE0000; + Mailbox::instance()->write(p_tx_mbox, (uint8_t*)&msg, sizeof(Msg_t), + trans_id++); +} + +bool DSPDevice::mail_query() +{ + return Mailbox::instance()->query(p_rx_mbox); +} + +int DSPDevice::mail_from() +{ + uint32_t size_rx, trans_id_rx; + Msg_t rxmsg; + + Mailbox::instance()->read(p_rx_mbox, (uint8_t*)&rxmsg, &size_rx, + &trans_id_rx); + + if (rxmsg.command == ERROR) + { + printf("%s", rxmsg.u.message); + return -1; + } + + if (rxmsg.command == PRINT) + { + printf("[core %c] %s", rxmsg.u.message[0], rxmsg.u.message+1); + return -1; + } + + return trans_id_rx; +} + +/****************************************************************************** +* void* DSPDevice::get_mpax_default_res, only need to be computed once +******************************************************************************/ +void* DSPDevice::get_mpax_default_res() +{ + if (p_mpax_default_res == NULL) + { + p_mpax_default_res = malloc(sizeof(keystone_mmap_resources_t)); + memset(p_mpax_default_res, 0, sizeof(keystone_mmap_resources_t)); + +#define NUM_VIRT_HEAPS 2 + uint32_t xmc_regs[MAX_XMCSES_MPAXS] = {3, 4, 5, 6, 7, 8, 9}; + uint32_t ses_regs[MAX_XMCSES_MPAXS] = {1, 2, 3, 4, 5, 6, 7}; + uint32_t 
heap_base[NUM_VIRT_HEAPS] = {0x80000000, 0xC0000000}; + uint32_t heap_size[NUM_VIRT_HEAPS] = {0x20000000, 0x40000000}; + for (int i = 0; i < MAX_XMCSES_MPAXS; i++) + { + xmc_regs[i] = FIRST_FREE_XMC_MPAX + i; + ses_regs[i] = FIRST_FREE_SES_MPAX + i; + } + keystone_mmap_resource_init(MAX_XMCSES_MPAXS, xmc_regs, ses_regs, + NUM_VIRT_HEAPS, heap_base, heap_size, + (keystone_mmap_resources_t *) p_mpax_default_res); + + } + return p_mpax_default_res; +} + +/****************************************************************************** +* cl_int DSPDevice::info +******************************************************************************/ +cl_int DSPDevice::info(cl_device_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) const +{ + void *value = 0; + size_t value_length = 0; + + union + { + cl_device_type cl_device_type_var; + cl_uint cl_uint_var; + size_t size_t_var; + cl_ulong cl_ulong_var; + cl_bool cl_bool_var; + cl_device_fp_config cl_device_fp_config_var; + cl_device_mem_cache_type cl_device_mem_cache_type_var; + cl_device_local_mem_type cl_device_local_mem_type_var; + cl_device_exec_capabilities cl_device_exec_capabilities_var; + cl_command_queue_properties cl_command_queue_properties_var; + cl_platform_id cl_platform_id_var; + size_t work_dims[MAX_WORK_DIMS]; + }; + + uint64_t maxblock; + uint32_t dummy; + + switch (param_name) + { + case CL_DEVICE_TYPE: + SIMPLE_ASSIGN(cl_device_type, CL_DEVICE_TYPE_ACCELERATOR); + break; + + case CL_DEVICE_VENDOR_ID: + SIMPLE_ASSIGN(cl_uint, 0); + break; + + case CL_DEVICE_MAX_COMPUTE_UNITS: + SIMPLE_ASSIGN(cl_uint, numDSPs()); + break; + + case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: + SIMPLE_ASSIGN(cl_uint, MAX_WORK_DIMS); + break; + + /*----------------------------------------------------------------- + * Set to local mem size / 128 so that conf basic/local_kernel_def + * can allocate and pass. This allows a long16 for each wi to exist + * in local mem. 
+ *----------------------------------------------------------------*/ + case CL_DEVICE_MAX_WORK_GROUP_SIZE: + SIMPLE_ASSIGN(size_t, 0xffffffff); //p_size_local_mem / 128); + break; + + case CL_DEVICE_MAX_WORK_ITEM_SIZES: + for (int i=0; i<MAX_WORK_DIMS; ++i) + { + work_dims[i] = 0xffffffff; //p_size_local_mem / 128; + } + value_length = MAX_WORK_DIMS * sizeof(size_t); + value = &work_dims; + break; + + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR: + SIMPLE_ASSIGN(cl_uint, 8); + break; + + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT: + SIMPLE_ASSIGN(cl_uint, 4); + break; + + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT: + SIMPLE_ASSIGN(cl_uint, 2); + break; + + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG: + SIMPLE_ASSIGN(cl_uint, 2); + break; + + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT: + SIMPLE_ASSIGN(cl_uint, 2); + break; + + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE: + SIMPLE_ASSIGN(cl_uint, 1); + break; + + case CL_DEVICE_MAX_CLOCK_FREQUENCY: + SIMPLE_ASSIGN(cl_uint, dspMhz()); + break; + + case CL_DEVICE_ADDRESS_BITS: + SIMPLE_ASSIGN(cl_uint, 32); + break; + + case CL_DEVICE_MAX_READ_IMAGE_ARGS: + SIMPLE_ASSIGN(cl_uint, 0); //images not supported + break; + + case CL_DEVICE_MAX_WRITE_IMAGE_ARGS: + SIMPLE_ASSIGN(cl_uint, 0); // images not supported + break; + + case CL_DEVICE_MAX_MEM_ALLOC_SIZE: + SIMPLE_ASSIGN(cl_ulong, std::min(p_device_ddr_heap1.size(), (cl_ulong)1ul << 30)); + break; + + case CL_DEVICE_IMAGE2D_MAX_WIDTH: + SIMPLE_ASSIGN(size_t, 0); // images not supported + break; + + case CL_DEVICE_IMAGE2D_MAX_HEIGHT: + SIMPLE_ASSIGN(size_t, 0); //images not supported + break; + + case CL_DEVICE_IMAGE3D_MAX_WIDTH: + SIMPLE_ASSIGN(size_t, 0); //images not supported + break; + + case CL_DEVICE_IMAGE3D_MAX_HEIGHT: + SIMPLE_ASSIGN(size_t, 0); //images not supported + break; + + case CL_DEVICE_IMAGE3D_MAX_DEPTH: + SIMPLE_ASSIGN(size_t, 0); //images not supported + break; + + case CL_DEVICE_IMAGE_SUPPORT: + SIMPLE_ASSIGN(cl_bool, CL_FALSE); //images not 
supported + break; + + case CL_DEVICE_MAX_PARAMETER_SIZE: + SIMPLE_ASSIGN(size_t, 116); // ASW TODO - needs to be 1024 + break; + + case CL_DEVICE_MAX_SAMPLERS: + SIMPLE_ASSIGN(cl_uint, 0); //images not supported + break; + + case CL_DEVICE_MEM_BASE_ADDR_ALIGN: + SIMPLE_ASSIGN(cl_uint, 1024); // 128 byte aligned + break; + + case CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE: + SIMPLE_ASSIGN(cl_uint, 128); + break; + + case CL_DEVICE_SINGLE_FP_CONFIG: + // Currently don't support CL_FP_DENORM + // ASW TODO: Investigate others + SIMPLE_ASSIGN(cl_device_fp_config, + CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST); + break; + + case CL_DEVICE_DOUBLE_FP_CONFIG: + SIMPLE_ASSIGN(cl_device_fp_config, + CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO | + CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM); + break; + + case CL_DEVICE_GLOBAL_MEM_CACHE_TYPE: + SIMPLE_ASSIGN(cl_device_mem_cache_type, CL_READ_WRITE_CACHE); + break; + + case CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE: + SIMPLE_ASSIGN(cl_uint, 128); + break; + + case CL_DEVICE_GLOBAL_MEM_CACHE_SIZE: + SIMPLE_ASSIGN(cl_ulong, 128*1024); + break; + + case CL_DEVICE_GLOBAL_MEM_SIZE: + SIMPLE_ASSIGN(cl_ulong, p_device_ddr_heap1.size()); + break; + + case CL_DEVICE_GLOBAL_EXT1_MEM_SIZE_TI: + SIMPLE_ASSIGN(cl_ulong, p_device_ddr_heap2.size()); + break; + + case CL_DEVICE_GLOBAL_EXT2_MEM_SIZE_TI: + SIMPLE_ASSIGN(cl_ulong, p_device_ddr_heap3.size()); + break; + + case CL_DEVICE_MSMC_MEM_SIZE_TI: + SIMPLE_ASSIGN(cl_ulong, p_device_msmc_heap.size()); + break; + + case CL_DEVICE_GLOBAL_MEM_MAX_ALLOC_TI: + p_device_ddr_heap1.max_block_size(maxblock, dummy); + SIMPLE_ASSIGN(cl_ulong, maxblock); + break; + + case CL_DEVICE_GLOBAL_EXT1_MEM_MAX_ALLOC_TI: + p_device_ddr_heap2.max_block_size(maxblock, dummy); + SIMPLE_ASSIGN(cl_ulong, maxblock); + break; + + case CL_DEVICE_GLOBAL_EXT2_MEM_MAX_ALLOC_TI: + p_device_ddr_heap3.max_block_size(maxblock, dummy); + SIMPLE_ASSIGN(cl_ulong, maxblock); + break; + + case CL_DEVICE_MSMC_MEM_MAX_ALLOC_TI: + 
p_device_msmc_heap.max_block_size(maxblock, dummy); + SIMPLE_ASSIGN(cl_ulong, maxblock); + break; + + case CL_DEVICE_LOCAL_MEM_MAX_ALLOC_TI: + p_device_l2_heap.max_block_size(maxblock, dummy); + SIMPLE_ASSIGN(cl_ulong, maxblock); + break; + + case CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: + SIMPLE_ASSIGN(cl_ulong, 64<<10); + break; + + case CL_DEVICE_MAX_CONSTANT_ARGS: + SIMPLE_ASSIGN(cl_uint, 8); + break; + + case CL_DEVICE_LOCAL_MEM_TYPE: + SIMPLE_ASSIGN(cl_device_local_mem_type, CL_LOCAL); + break; + + case CL_DEVICE_LOCAL_MEM_SIZE: + SIMPLE_ASSIGN(cl_ulong, p_device_l2_heap.size()); + break; + + case CL_DEVICE_ERROR_CORRECTION_SUPPORT: + // ASW TODO - check answer + SIMPLE_ASSIGN(cl_bool, CL_FALSE); + break; + + case CL_DEVICE_HOST_UNIFIED_MEMORY: + SIMPLE_ASSIGN(cl_bool, CL_FALSE); + break; + + case CL_DEVICE_PROFILING_TIMER_RESOLUTION: + SIMPLE_ASSIGN(size_t, 1000); // 1000 nanoseconds = 1 microsecond + break; + + case CL_DEVICE_ENDIAN_LITTLE: + SIMPLE_ASSIGN(cl_bool, CL_TRUE); + break; + + case CL_DEVICE_AVAILABLE: + SIMPLE_ASSIGN(cl_bool, CL_TRUE); + break; + + case CL_DEVICE_COMPILER_AVAILABLE: + SIMPLE_ASSIGN(cl_bool, CL_TRUE); + break; + + case CL_DEVICE_EXECUTION_CAPABILITIES: + SIMPLE_ASSIGN(cl_device_exec_capabilities, CL_EXEC_KERNEL); + break; + + case CL_DEVICE_QUEUE_PROPERTIES: + SIMPLE_ASSIGN(cl_command_queue_properties, + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | + CL_QUEUE_PROFILING_ENABLE); + break; + + case CL_DEVICE_NAME: + // ASW TODO add device number suffix +#ifdef DSPC868X + STRING_ASSIGN("TI TMS320C6678 DSP"); +#else + STRING_ASSIGN("TI K2H DSP (8x C66)"); +#endif + break; + + case CL_DEVICE_VENDOR: + STRING_ASSIGN("Texas Instruments, Inc."); + break; + + case CL_DRIVER_VERSION: + STRING_ASSIGN("" COAL_VERSION); + break; + + case CL_DEVICE_PROFILE: + STRING_ASSIGN("FULL_PROFILE"); + break; + + case CL_DEVICE_VERSION: + STRING_ASSIGN("OpenCL 1.1 TI " COAL_VERSION); + break; + + case CL_DEVICE_EXTENSIONS: + 
STRING_ASSIGN("cl_khr_byte_addressable_store" + " cl_khr_global_int32_base_atomics" + " cl_khr_global_int32_extended_atomics" + " cl_khr_local_int32_base_atomics" + " cl_khr_local_int32_extended_atomics" + " cl_khr_fp64" + " cl_ti_msmc_buffers") + break; + + case CL_DEVICE_PLATFORM: + SIMPLE_ASSIGN(cl_platform_id, &the_platform); + break; + + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF: + SIMPLE_ASSIGN(cl_uint, 0); + break; + + case CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR: + SIMPLE_ASSIGN(cl_uint, 8); + break; + + case CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT: + SIMPLE_ASSIGN(cl_uint, 4); + break; + + case CL_DEVICE_NATIVE_VECTOR_WIDTH_INT: + SIMPLE_ASSIGN(cl_uint, 2); + break; + + case CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG: + SIMPLE_ASSIGN(cl_uint, 2); + break; + + case CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT: + SIMPLE_ASSIGN(cl_uint, 2); + break; + + case CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE: + SIMPLE_ASSIGN(cl_uint, 1); + break; + + case CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF: + SIMPLE_ASSIGN(cl_uint, 0); + break; + + case CL_DEVICE_OPENCL_C_VERSION: + STRING_ASSIGN("OpenCL C 1.1 LLVM " LLVM_VERSION); + break; + + default: + return CL_INVALID_VALUE; + } + + if (param_value && param_value_size < value_length) + return CL_INVALID_VALUE; + + if (param_value_size_ret) + *param_value_size_ret = value_length; + + if (param_value) + std::memcpy(param_value, value, value_length); + + return CL_SUCCESS; +} + +/****************************************************************************** +* Call back functions from the target loader +******************************************************************************/ +extern "C" +{ + +/*****************************************************************************/ +/* DLIF_ALLOCATE() - Return the load address of the segment/section */ +/* described in its parameters and record the run address in */ +/* run_address field of DLOAD_MEMORY_REQUEST. 
*/ +/*****************************************************************************/ +BOOL DLIF_allocate(void* client_handle, struct DLOAD_MEMORY_REQUEST *targ_req) +{ + DSPDevice* device = (DSPDevice*) client_handle; + + /*------------------------------------------------------------------------*/ + /* Get pointers to API segment and file descriptors. */ + /*------------------------------------------------------------------------*/ + struct DLOAD_MEMORY_SEGMENT* obj_desc = targ_req->segment; + + uint32_t addr; + + if (obj_desc->target_address >> 20 == 0x008) + addr = (uint32_t)device->malloc_local (obj_desc->memsz_in_bytes); + else if (obj_desc->target_address >> 24 == 0x0C) + addr = (uint32_t)device->malloc_msmc (obj_desc->memsz_in_bytes); + else addr = (uint32_t)device->malloc_global(obj_desc->memsz_in_bytes); + +#if DEBUG + printf("DLIF_allocate: %d bytes starting at 0x%x (relocated from 0x%x)\n", + obj_desc->memsz_in_bytes, (uint32_t)addr, + (uint32_t)obj_desc->target_address); +#endif + + obj_desc->target_address = (TARGET_ADDRESS) addr; + + /*------------------------------------------------------------------------*/ + /* Target memory request was successful. */ + /*------------------------------------------------------------------------*/ + return addr == 0 ? 0 : 1; +} + +/*****************************************************************************/ +/* DLIF_RELEASE() - Unmap or free target memory that was previously */ +/* allocated by DLIF_allocate(). 
*/ +/*****************************************************************************/ +BOOL DLIF_release(void* client_handle, struct DLOAD_MEMORY_SEGMENT* ptr) +{ + DSPDevice* device = (DSPDevice*) client_handle; + + if (ptr->target_address >> 20 == 0x008) + device->free_local ((DSPDevicePtr)ptr->target_address); + else if (ptr->target_address >> 24 == 0x0C) + device->free_msmc ((DSPDevicePtr)ptr->target_address); + else device->free_global((DSPDevicePtr)ptr->target_address); + +#if DEBUG + printf("DLIF_free: %d bytes starting at 0x%x\n", + ptr->memsz_in_bytes, (uint32_t)ptr->target_address); +#endif + + return 1; +} + +/*****************************************************************************/ +/* DLIF_WRITE() - Write updated (relocated) segment contents to target */ +/* memory. */ +/*****************************************************************************/ +BOOL DLIF_write(void* client_handle, struct DLOAD_MEMORY_REQUEST* req) +{ + struct DLOAD_MEMORY_SEGMENT* obj_desc = req->segment; + DSPDevice* device = (DSPDevice*) client_handle; + int dsp_id = device->dspID(); + + Driver::instance()->write (dsp_id, + (uint32_t)obj_desc->target_address, + (uint8_t*)req->host_address, + obj_desc->memsz_in_bytes); + +#if DEBUG + printf("DLIF_write (dsp:%d): %d bytes starting at 0x%x\n", + dsp_id, obj_desc->memsz_in_bytes, + (uint32_t)obj_desc->target_address); +#endif + + extern DSPProgram::segment_list *segments; + + if (segments) segments->push_back + (DSPProgram::seg_desc((DSPDevicePtr)obj_desc->target_address, obj_desc->memsz_in_bytes, req->flags)); + + return 1; +} + +/****************************************************************************** +* DLIF_LOAD_DEPENDENT() +******************************************************************************/ +int DLIF_load_dependent(void* client_handle, const char* so_name) +{ + DSPDevice* device = (DSPDevice*) client_handle; + FILE* fp = fopen(so_name, "rb"); + + if (!fp) + { + DLIF_error(DLET_FILE, "Can't open dependent 
file '%s'.\n", so_name); + return 0; + } + + int to_ret = DLOAD_load(device->dload_handle(), fp); + + if (!to_ret) + DLIF_error(DLET_MISC, "Failed load of dependent file '%s'.\n", so_name); + + fclose(fp); + return to_ret; +} + +/****************************************************************************** +* DLIF_UNLOAD_DEPENDENT() +******************************************************************************/ +void DLIF_unload_dependent(void* client_handle, uint32_t file_handle) +{ + DSPDevice* device = (DSPDevice*) client_handle; + DLOAD_unload(device->dload_handle(), file_handle); +} + +} + +void dump_hex(char *addr, int bytes) +{ + int cnt = 0; + + printf("\n"); + while (cnt < bytes) + { + for (int col = 0; col < 16; ++col) + { + printf("%02x ", addr[cnt++] & 0xff); + if (cnt >= bytes) break; + } + printf("\n"); + } +} + diff --git a/src/core/dsp/device.h b/src/core/dsp/device.h new file mode 100644 index 0000000..4a6f32a --- /dev/null +++ b/src/core/dsp/device.h @@ -0,0 +1,151 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#ifndef __DSP_DEVICE_H__ +#define __DSP_DEVICE_H__ + +extern "C" { +#include "dload_api.h" +} + +#include "../deviceinterface.h" +#include "dspheap.h" +#include "message.h" +#include "u_concurrent_map.h" +#include "kernel.h" +#include <pthread.h> +#include <string> +#include <list> + +namespace Coal +{ + +class MemObject; +class Event; +class Program; +class Kernel; + +class DSPDevice : public DeviceInterface +{ + public: + DSPDevice(unsigned char dsp_id); + ~DSPDevice(); + + void init(); + + cl_int info(cl_device_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) const; + + DeviceBuffer *createDeviceBuffer(MemObject *buffer, cl_int *rs); + DeviceProgram *createDeviceProgram(Program *program); + DeviceKernel *createDeviceKernel(Kernel *kernel, + llvm::Function *function); + + cl_int initEventDeviceData(Event *event); + void freeEventDeviceData(Event *event); + + void pushEvent(Event *event); + bool stop(); + bool availableEvent(); + Event *getEvent(bool &stop); + + unsigned int numDSPs() const; + float dspMhz() const; + unsigned char dspID() const; + 
DLOAD_HANDLE dload_handle() const; + + int load(const char *filename); + bool unload(int file_handle); + + /*--------------------------------------------------------------------- + * These malloc routines return a uint32_t instead of a pointer + * Because the target memory space is 32 bit and is independent of the + * size of a host pointer (ie. 32bit vs 64 bit) + * Device/Target global memory could be 36-bit. + * get_local_scratch returns max local free block for per kernel use. + *--------------------------------------------------------------------*/ + DSPDevicePtr get_local_scratch(uint32_t &size, uint32_t &block_size); + DSPDevicePtr malloc_local (size_t size); + void free_local (DSPDevicePtr add); + DSPDevicePtr malloc_msmc (size_t size); + void free_msmc (DSPDevicePtr add); + DSPDevicePtr64 malloc_global(size_t size, bool prefer_32bit=true); + void free_global (DSPDevicePtr64 add); + + void mail_to (Msg_t& msg); + bool mail_query(); + int mail_from (); + + void push_complete_pending(uint32_t idx, class Event* const data); + bool get_complete_pending(uint32_t idx, class Event* &data); + void dump_complete_pending(); + bool any_complete_pending(); + bool gotEnoughToWorkOn(); + + std::string builtinsHeader(void) const { return "dsp.h"; } + + DSPDevicePtr get_addr_kernel_config() { return p_addr_kernel_config; } + void* get_mpax_default_res(); + + private: + unsigned int p_cores; + unsigned int p_num_events; + float p_dsp_mhz; + pthread_t p_worker; + void* p_rx_mbox; // int + void* p_tx_mbox; + std::list<Event *> p_events; + pthread_cond_t p_events_cond; + pthread_mutex_t p_events_mutex; + bool p_stop; + bool p_initialized; + unsigned char p_dsp_id; + dspheap p_device_ddr_heap1; // persistently mapped memory + dspheap p_device_ddr_heap2; // ondemand mapped memory + dspheap p_device_ddr_heap3; // addl ondemand mapped memory + dspheap p_device_l2_heap; + dspheap p_device_msmc_heap; + DLOAD_HANDLE p_dload_handle; + concurrent_map<uint32_t, class Event*> 
p_complete_pending; + + DSPDevicePtr p_addr_kernel_config; + DSPDevicePtr64 p_addr64_global_mem; + DSPDevicePtr p_addr_local_mem; + DSPDevicePtr p_addr_msmc_mem; + DSPDevicePtr p_addr_mbox_d2h_phys; + DSPDevicePtr p_addr_mbox_h2d_phys; + uint64_t p_size64_global_mem; + uint32_t p_size_local_mem; + uint32_t p_size_msmc_mem; + uint32_t p_size_mbox_d2h; + uint32_t p_size_mbox_h2d; + void* p_mpax_default_res; +}; +} +#endif diff --git a/src/core/dsp/driver.cpp b/src/core/dsp/driver.cpp new file mode 100644 index 0000000..08e97f7 --- /dev/null +++ b/src/core/dsp/driver.cpp @@ -0,0 +1,34 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#ifdef DSPC868X +#include "driver_shannon.cpp" +#include "cmem.cpp" +#else +#include "driver_hawking.cpp" +#include "shmem.cpp" +#endif diff --git a/src/core/dsp/driver.h b/src/core/dsp/driver.h new file mode 100644 index 0000000..1e41a28 --- /dev/null +++ b/src/core/dsp/driver.h @@ -0,0 +1,100 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#ifndef _DRIVER_H +#define _DRIVER_H +#include <vector> +#include "u_lockable.h" +#include "device.h" + +#ifdef DSPC868X +extern "C" +{ + #include "pciedrv.h" + #include "dnldmgr.h" + #include "cmem_drv.h" + #include "bufmgr.h" +} +#else +#include "shmem.h" +#endif + +class Driver : public Lockable_off +{ + public: + ~Driver() { close(); } + int32_t num_dsps() const { return pNum_dsps; } + int32_t close(); + + int32_t write(int32_t dsp, DSPDevicePtr64 addr, uint8_t *buf, uint32_t sz); + int32_t read (int32_t dsp, DSPDevicePtr64 addr, uint8_t *buf, uint32_t sz); + + void* reset_and_load (int chip); + void free_image_handle(void *handle); + void cmem_init(DSPDevicePtr64 *addr1, uint64_t *size1, + DSPDevicePtr *addr2, uint32_t *size2, + DSPDevicePtr64 *addr3, uint64_t *size3); + void cmem_exit(); + DSPDevicePtr64 cmem_ondemand_malloc(uint64_t size); + void cmem_ondemand_free (DSPDevicePtr64 addr); + void split_ddr_memory (DSPDevicePtr64 addr, uint64_t size, + DSPDevicePtr64& addr1, uint64_t& size1, + DSPDevicePtr64& addr2, uint64_t& size2, + uint64_t& size3); + void shmem_configure (DSPDevicePtr64 
addr, uint64_t size, + int cmem_block = -1); + void* map (DSPDevicePtr64 addr, uint32_t sz, + bool is_read = false); + int32_t unmap (void *host_addr, DSPDevicePtr64 buf_addr, + uint32_t sz, bool is_write = false); + DSPDevicePtr get_symbol(void* image_handle, const char *name); + + static Driver* instance (); + + private: + static Driver* pInstance; + int32_t pNum_dsps; + +#ifdef DSPC868X + pciedrv_open_config_t config; + pciedrv_device_info_t *pDevices_info; +#else + std::vector<shmem*> pShmem_areas; + shmem* get_memory_region(DSPDevicePtr64 addr); +#endif + + int32_t open (); + bool wait_for_ready(int chip); + int32_t write_core(int32_t dsp, DSPDevicePtr64 addr, uint8_t *buf, + uint32_t sz); + + Driver() { open(); } + Driver(const Driver&); // copy ctor disallowed + Driver& operator=(const Driver&); // assignment disallowed +}; + +#endif // _DRIVER_H diff --git a/src/core/dsp/driver_hawking.cpp b/src/core/dsp/driver_hawking.cpp new file mode 100644 index 0000000..7cb2857 --- /dev/null +++ b/src/core/dsp/driver_hawking.cpp @@ -0,0 +1,451 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "driver.h" +#include <deque> +#include <iostream> +#include <cstring> +#include <cstdio> +#include <cstdlib> +#include <sys/stat.h> +#include <string> +#include <bfd.h> + +extern "C" +{ + #include "mpmclient.h" +}; + + +#define ERR(status, msg) if (status) { printf("ERROR: %s\n", msg); exit(-1); } +#define BOOT_ENTRY_LOCATION_ADDR 0x87FFFC +#define BOOT_MAGIC_ADDR(core) (0x10000000 | (core << 24) | 0x87FFFC) + +Driver* Driver::pInstance = 0; + +/****************************************************************************** +* Thread safe instance function for singleton behavior +******************************************************************************/ +Driver* Driver::instance () +{ + static Mutex Driver_instance_mutex; + Driver* tmp = pInstance; + + __sync_synchronize(); + + if (tmp == 0) + { + ScopedLock lck(Driver_instance_mutex); + + tmp = pInstance; + if (tmp == 0) + { + tmp = new Driver; + __sync_synchronize(); + pInstance = tmp; + } + } + return tmp; +} + +/****************************************************************************** +* Convert pci data into a recognizable 
board name for a device +******************************************************************************/ +const char *get_board(unsigned switch_device) +{ + switch (switch_device) + { + case 0x8624: return "dspc8681"; + case 0x8748: return "dspc8682"; + default : ERR(1, "Unsupported device"); return "unknown"; + } +} + +#define TOTAL_NUM_CORES_PER_CHIP 8 + +/****************************************************************************** +* wait_for_ready +******************************************************************************/ +bool Driver::wait_for_ready(int chip) { return true; } + +static void report_core_state(const char *curr_core) +{ +#if 0 + char state[50]; + int ret; + mpm_slave_state_e core_state; + + ret = mpm_state(curr_core, &core_state); + if ( ret < 0) + printf("state query failed, %s\n", curr_core); + + switch (core_state) + { + case mpm_slave_state_idle: sprintf(state, "idle"); break; + case mpm_slave_state_loaded: sprintf(state, "loaded"); break; + case mpm_slave_state_running: sprintf(state, "running"); break; + case mpm_slave_state_crashed: sprintf(state, "crashed"); break; + case mpm_slave_state_error: sprintf(state, "in error"); break; + + default: sprintf(state, "in undefined state"); break; + } + + printf("DSP core state: %s is %s\n", curr_core, state); +#endif +} + +void *Driver::reset_and_load(int chip) +{ + int ret; + int error_code = 0; + int error_code_msg[50]; + char curr_core[10]; + + std::string get_ocl_dsp(); + std::string monitor = get_ocl_dsp() + "/dsp.out"; + + for (int core=0; core< TOTAL_NUM_CORES_PER_CHIP; core++) + { + snprintf(curr_core, 5, "dsp%d", core); + + ret = mpm_reset(curr_core, &error_code); + if ( ret < 0) + printf("reset failed, core %d (retval: %d, error: %d)\n", + core, ret, error_code); +// JKN Update ERR to handle error_code + ERR (ret, "DSP out of reset failed"); + + report_core_state(curr_core); + } + + /*------------------------------------------------------------------------- + * Load monitor on the 
devices + *------------------------------------------------------------------------*/ + for (int core=0; core< TOTAL_NUM_CORES_PER_CHIP; core++) + { + snprintf(curr_core, 5,"dsp%d", core); + ret = mpm_load(curr_core, const_cast<char*>(monitor.c_str()), + &error_code); + if ( ret < 0) + printf("load failed, core %d (retval: %d, error: %d)\n", + core, ret, error_code); + ERR(ret, "Download image failed"); + + report_core_state(curr_core); + } + + /*------------------------------------------------------------------------- + * Run monitor on the devices + *------------------------------------------------------------------------*/ + for (int core=0; core< TOTAL_NUM_CORES_PER_CHIP; core++) + { + snprintf(curr_core, 5,"dsp%d", core); + ret = mpm_run(curr_core, &error_code); + if ( ret < 0) + printf("run failed, core %d (retval: %d, error: %d)\n", + core, ret, error_code); + ERR(ret, "DSP run failed"); + + report_core_state(curr_core); + } + + bfd *dsp_bfd = bfd_openr(monitor.c_str(), NULL); + char** matching; + char *ptr; + + if(dsp_bfd == NULL) + { + printf("\nERROR:driver: %s Error Open image %s\n", + bfd_errmsg(bfd_get_error()), monitor.c_str()); + exit(-1); + } + /* Check format with matching */ + if (!bfd_check_format_matches (dsp_bfd, bfd_object, &matching)) + { + fprintf(stderr, "\nERROR:driver %s: %s\n", monitor.c_str(), + bfd_errmsg(bfd_get_error())); + if (bfd_get_error () == bfd_error_file_ambiguously_recognized) + { + for (ptr = *matching; ptr != NULL; ptr++) + { + printf("%s: \n", ptr); + exit(-1); + } + free (matching); + } + } + + return (void *)dsp_bfd; +} + +/****************************************************************************** +* Driver::open +******************************************************************************/ +int32_t Driver::open() +{ + Lock lock(this); + + pNum_dsps = 1; + + return 0; +} + +/****************************************************************************** +* Driver::close() 
+******************************************************************************/ +int32_t Driver::close() +{ + Lock lock(this); + + while (!pShmem_areas.empty()) delete pShmem_areas.back(), pShmem_areas.pop_back(); + + cmem_exit(); + return 0; +} + +void Driver::cmem_init(DSPDevicePtr64 *addr1, uint64_t *size1, + DSPDevicePtr *addr2, uint32_t *size2, + DSPDevicePtr64 *addr3, uint64_t *size3) +{ + shmem_cmem_persistent::cmem_init(addr1, size1, addr2, size2, addr3, size3); +} + +void Driver::cmem_exit() +{ + shmem_cmem_persistent::cmem_exit(); +} + +DSPDevicePtr64 Driver::cmem_ondemand_malloc(uint64_t size) +{ + return shmem_cmem_ondemand::cmem_malloc(size); +} + +void Driver::cmem_ondemand_free(DSPDevicePtr64 addr) +{ + shmem_cmem_ondemand::cmem_free(addr); +} + +/****************************************************************************** +* Driver::split_ddr_heap: partition DDR to persistent mapping part (heap1) +* and on demand mapping part (heap2) +******************************************************************************/ +void Driver::split_ddr_memory(DSPDevicePtr64 addr, uint64_t size, + DSPDevicePtr64& addr1, uint64_t& size1, + DSPDevicePtr64& addr2, uint64_t& size2, + uint64_t& size3) +{ + addr1 = addr; + size1 = size; + addr2 = 0; + size2 = 0; + + + // split ddr memory 1 into two chunks + if (getenv("TI_OCL_DSP_NOMAP") != NULL) + { + size3 = 0; + } + else if (addr + size > ALL_PERSISTENT_MAX_DSP_ADDR || + (size3 > 0 && addr + size > MPAX_USER_MAPPED_DSP_ADDR)) + { + size2 = addr + size - MPAX_USER_MAPPED_DSP_ADDR; + size1 = size - size2; + addr2 = addr + size1; + } + + // translate first chunk to using 32-bit aliased physical addresses + if (addr > DSP_36BIT_ADDR) + { + addr1 = addr + 0xA0000000 - 0x820000000ULL; + /*--------------------------------------------------------------------- + * if the ddr size is greater than we can currently support, limit it + *--------------------------------------------------------------------*/ + //const int 
ddr_size_limit = (1.5 * 1024*1024*1024) - (48 *1024*1024); + const uint64_t ddr_size_limit = ALL_PERSISTENT_MAX_DSP_ADDR - addr; + if (size1 > ddr_size_limit) + size1 = ddr_size_limit; + } +} + +void Driver::shmem_configure(DSPDevicePtr64 addr, uint64_t size, int cmem_block) +{ + if (size <= 0) return; + + shmem *area; + if (addr >= MPAX_USER_MAPPED_DSP_ADDR) + area = new shmem_cmem_ondemand(); + else if (cmem_block >= 0) + area = new shmem_cmem_persistent(cmem_block); + else + area = new shmem_persistent(); + + area->configure(addr, size); + pShmem_areas.push_back(area); +} + +/****************************************************************************** +* Driver::get_memory_region +******************************************************************************/ +shmem* Driver::get_memory_region(DSPDevicePtr64 addr) +{ + + for (int i = 0; i < pShmem_areas.size(); ++i) + { + uint64_t end_exclusive = (uint64_t)pShmem_areas[i]->start() + + pShmem_areas[i]->size(); + + if (addr >= pShmem_areas[i]->start() && addr < end_exclusive) + return pShmem_areas[i]; + } + + printf("Illegal memory region: addr = 0x%llx\n", addr); + exit(-1); +} + + +/****************************************************************************** +* Driver::write +******************************************************************************/ +int32_t Driver::write(int32_t dsp_id, DSPDevicePtr64 addr, uint8_t *buf, + uint32_t size) +{ + int core; + /*------------------------------------------------------------------------- + * if the write is to L2, then write for each core + *------------------------------------------------------------------------*/ + if ((addr >> 20) == 0x008) + for (core=0; core< TOTAL_NUM_CORES_PER_CHIP; core++) + write_core(dsp_id, ((0x10 + core) << 24) + addr, buf, size); + else write_core(dsp_id, addr, buf, size); +} + +/****************************************************************************** +* Driver::write_core 
+******************************************************************************/ +int32_t Driver::write_core(int32_t dsp_id, DSPDevicePtr64 addr, uint8_t *buf, + uint32_t size) +{ + Lock lock(this); + + shmem* region = get_memory_region(addr); + void* dst_host_addr = region->map(addr, size, false); + if (dst_host_addr) memcpy((char*)dst_host_addr, buf, size); + else ERR(1, "Unable to map dsp addr for write"); + region->unmap(dst_host_addr, size, true); + + return 0; +} + +void* Driver::map(DSPDevicePtr64 addr, uint32_t sz, bool is_read) +{ + Lock lock(this); + shmem* region = get_memory_region(addr); + void* host_addr = region->map(addr, sz, is_read); + if (host_addr == NULL) ERR(1, "Unable to map a dsp address"); + return host_addr; +} + +int32_t Driver::unmap(void *host_addr, DSPDevicePtr64 buf_addr, uint32_t sz, + bool is_write) +{ + Lock lock(this); + shmem* region = get_memory_region(buf_addr); + region->unmap(host_addr, sz, is_write); + return 0; +} + +/****************************************************************************** +* Driver::read +******************************************************************************/ +int32_t Driver::read(int32_t dsp_id, DSPDevicePtr64 addr, uint8_t *buf, + uint32_t size) +{ + Lock lock(this); + + shmem* region = get_memory_region(addr); + void* dst_host_addr = region->map(addr, size, true); + if (dst_host_addr) memcpy(buf, (char*)dst_host_addr, size); + else ERR(1, "Unable to map dsp addr for read"); + region->unmap(dst_host_addr, size, false); + + return 0; +} + +/****************************************************************************** +* Driver::free_image_handle +******************************************************************************/ +void Driver::free_image_handle(void *handle) +{ + bfd_close((bfd*)handle); +} + +/****************************************************************************** +* Driver::get_symbol +******************************************************************************/ 
+DSPDevicePtr Driver::get_symbol(void* image_handle, const char *name) +{ + DSPDevicePtr addr; + bfd* dsp_bfd; + uint32_t nsyms, nsize; + asymbol ** symtab; + symbol_info syminfo; + int i; + + if (!image_handle) + { + std::cout << "ERROR: Failed to get image handle" << std::endl; + exit(-1); + } + + dsp_bfd = (bfd *)image_handle; + + /*------------------------------------------------------------------------- + * Find boot address and address of mpi_rank. + *------------------------------------------------------------------------*/ + nsize = bfd_get_symtab_upper_bound (dsp_bfd); + if ((symtab = (asymbol**)malloc(nsize)) == NULL) + { + std::cout << "ERROR: Failed to malloc memory in get_symbol" << std::endl; + exit(-1); + } + + nsyms = bfd_canonicalize_symtab(dsp_bfd, symtab); + + for (i = 0; i < nsyms; i++) + if (strcmp(symtab[i]->name, name) == 0) + { + bfd_symbol_info(symtab[i], &syminfo); + DSPDevicePtr addr = syminfo.value; + free(symtab); + + return addr; + } + + free(symtab); + std::cout << "ERROR: Get symbol failed" << std::endl; + exit(-1); +} diff --git a/src/core/dsp/driver_shannon.cpp b/src/core/dsp/driver_shannon.cpp new file mode 100644 index 0000000..b428dbb --- /dev/null +++ b/src/core/dsp/driver_shannon.cpp @@ -0,0 +1,313 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "driver.h" +#include "cmem.h" +#include <deque> +#include <iostream> +#include <cstring> +#include <cstdio> +#include <cstdlib> +#include <sys/stat.h> +#include <string> + +#define ERR(status, msg) if (status) { printf("ERROR: %s\n", msg); exit(-1); } +#define BOOT_ENTRY_LOCATION_ADDR 0x87FFFC +#define BOOT_MAGIC_ADDR(core) (0x10000000 | (core << 24) | 0x87FFFC) + +Driver* Driver::pInstance = 0; + +/****************************************************************************** +* Thread safe instance function for singleton behavior +******************************************************************************/ +Driver* Driver::instance () +{ + static Mutex Driver_instance_mutex; + Driver* tmp = pInstance; + + __sync_synchronize(); + + if (tmp == 0) + { + ScopedLock lck(Driver_instance_mutex); + + tmp = pInstance; + if (tmp == 0) + { + tmp = new Driver; + __sync_synchronize(); + pInstance = tmp; + } + } + return tmp; +} + +/****************************************************************************** +* Convert pci data into a recognizable board name for a device +******************************************************************************/ +const char *get_board(unsigned switch_device) +{ + switch (switch_device) + { + case 0x8624: return "dspc8681"; + case 0x8748: return "dspc8682"; + default : ERR(1, "Unsupported device"); return "unknown"; + } +} + +#define TOTAL_NUM_CORES_PER_CHIP 8 + +/****************************************************************************** +* wait_for_ready +******************************************************************************/ +bool Driver::wait_for_ready(int chip) +{ + int execution_wait_count = 0; + while (1) + { + int core; + for (core=0; core< TOTAL_NUM_CORES_PER_CHIP; core++) + { + uint32_t boot_entry_value; + int ret = pciedrv_dsp_read(chip, + ((0x10 + core) << 24) + BOOT_ENTRY_LOCATION_ADDR, + (unsigned char *) 
&boot_entry_value, 4); + ERR(ret, "pciedrv_dsp_read failed"); + + if (boot_entry_value != 0) break; + } + + if (core == TOTAL_NUM_CORES_PER_CHIP) return true; + if (++execution_wait_count > 1000) return false; + + usleep(1000); + } +} + +char *get_ocl_install(); +void *Driver::reset_and_load(int chip) +{ + char *installation = get_ocl_install(); + + /*------------------------------------------------------------------------ + * Determine DSP speed. 1 Ghz by default. Set Env Var for 1.25Ghz Oper + *-----------------------------------------------------------------------*/ + uint32_t pll_multiplier = 0x00000014; // 1.00 Ghz by default + if (getenv("TI_OCL_DSP_1_25GHZ")) pll_multiplier = 0x00000019; + + /*------------------------------------------------------------------------- + * Configure boot config + *------------------------------------------------------------------------*/ + uint32_t bootcfg_words[]= { 0xBABEFACE, pll_multiplier }; + boot_cfg_t bootcfg = { 0x86FF00, sizeof(bootcfg_words), bootcfg_words}; + + /*------------------------------------------------------------------------- + * reset the devices + *------------------------------------------------------------------------*/ + int ret = dnldmgr_reset_dsp(chip, 0, NULL, 0 , NULL); + ERR (ret, "DSP putting in reset failed"); + + const char *board = get_board(pDevices_info[chip].switch_device); + std::string init(installation); + init += "/lib/init_"; + init += board; + init += ".out"; + + void * image_handle; + uint32_t entry; + + ret = dnldmgr_get_image(init.c_str(), &image_handle, &entry); + ERR(ret, "Get reset image failed"); + + ret = dnldmgr_reset_dsp(chip, 1, image_handle, entry, &bootcfg); + ERR (ret, "DSP out of reset failed"); + + dnldmgr_free_image(image_handle); + + /*--------------------------------------------------------------------- + * wait for reset to complete + *--------------------------------------------------------------------*/ + ERR(!wait_for_ready(chip), "Reset Failed due to 
/******************************************************************************
* Driver::open
*
* Opens the PCIe driver with a fixed configuration, records how many devices
* it reports in pNum_dsps, allocates pDevices_info with one entry per device
* and fills it from pciedrv_get_pci_info(), then touches the Cmem singleton.
*
* Every failing step goes through ERR(), which prints the message and calls
* exit(-1); on success the function returns 0.
******************************************************************************/
int32_t Driver::open()
{
    Lock lock(this);

    // Start from an all-zero configuration, then set the fields below.
    // NOTE(review): the meaning of the individual values is defined by the
    // pciedrv API, which is not visible here.
    memset((void*)&config, 0, sizeof(pciedrv_open_config_t));
    config.dsp_outbound_reserved_mem_size = 0;
    config.start_dma_chan_num = 0;
    config.num_dma_channels = 4;
    config.start_param_set_num = 0;
    config.num_param_sets = 32;
    config.dsp_outbound_block_size = 0x400000;
    config.max_dma_transactions = 256;

    int status = pciedrv_open(&config);
    ERR(status, "PCIe Driver Open Error");

    pNum_dsps = pciedrv_get_num_devices();

    /*-------------------------------------------------------------------------
    * Allocate space for and retrieve device info
    *------------------------------------------------------------------------*/
    // The buffer allocated here is released with free() in Driver::close().
    pDevices_info = (pciedrv_device_info_t*)
                    malloc(pNum_dsps * sizeof(pciedrv_device_info_t));
    ERR (!pDevices_info, "malloc failed pciedrv_devices_info_t");

    int ret = pciedrv_get_pci_info(pDevices_info);
    ERR(ret, "get pci info failed");

    Cmem::instance(); // Prime the setup of cmem
    return 0;
}
Driver Close Error"); + return 0; +} + + +/****************************************************************************** +* Driver::write +******************************************************************************/ +int32_t Driver::write(int32_t dsp_id, DSPDevicePtr addr, uint8_t *buf, + uint32_t size) +{ + int core; + /*------------------------------------------------------------------------- + * if the write is to L2, then write for each core + *------------------------------------------------------------------------*/ + if ((addr >> 20) == 0x008) + for (core=0; core< TOTAL_NUM_CORES_PER_CHIP; core++) + write_core(dsp_id, ((0x10 + core) << 24) + addr, buf, size); + else write_core(dsp_id, addr, buf, size); +} + + +/****************************************************************************** +* Driver::write +******************************************************************************/ +int32_t Driver::write_core(int32_t dsp_id, DSPDevicePtr addr, uint8_t *buf, + uint32_t size) +{ + /*------------------------------------------------------------------------- + * Regular writes under 24k are faster than DMA writes (may change) + *------------------------------------------------------------------------*/ + if (size < 24 * 1024) + { + int status = pciedrv_dsp_write(dsp_id, addr, buf, size); + ERR(status, "PCIe Driver Write Error"); + return 0; + } + + Lock lock(this); + Cmem::instance()->dma_write(dsp_id, addr, buf, size); + return 0; +} + +void* Driver::map(DSPDevicePtr addr, uint32_t sz, bool is_read) +{ + return (void*) (uint64_t) addr; +} + +int32_t Driver::unmap(void *host_addr, DSPDevicePtr buf_addr, + uint32_t sz, bool is_write) +{ +} + +/****************************************************************************** +* Driver::read +******************************************************************************/ +int32_t Driver::read(int32_t dsp_id, DSPDevicePtr addr, uint8_t *buf, + uint32_t size) +{ + Cmem::instance()->dma_read(dsp_id, addr, buf, 
size); + return 0; +} + +/****************************************************************************** +* Driver::get_symbol +******************************************************************************/ +DSPDevicePtr Driver::get_symbol(void* image_handle, const char *name) +{ + DSPDevicePtr addr; + int ret = dnldmgr_get_symbol_address(image_handle, name, &addr); + if (ret) { printf("ERROR: Get symbol failed\n"); exit(-1); } + + return addr; +} + +/****************************************************************************** +* Driver::free_image_handle +******************************************************************************/ +void Driver::free_image_handle(void *handle) +{ + dnldmgr_free_image(handle); +} + +/****************************************************************************** +* Driver::cmem_setup +* Driver::shmem_configure +******************************************************************************/ +void Driver::cmem_init(DSPDevicePtr64 *addr1, uint64_t *size1, + DSPDevicePtr *addr2, uint32_t *size2) +{ +} + +void Driver::cmem_exit() +{ +} + +void Driver::shmem_configure(DSPDevicePtr addr, uint32_t size, int cmem_block) +{ +} + diff --git a/src/core/dsp/dspheap.h b/src/core/dsp/dspheap.h new file mode 100644 index 0000000..0668647 --- /dev/null +++ b/src/core/dsp/dspheap.h @@ -0,0 +1,200 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +/**************************************************************************//** +* @file dspheap.h +* +* @brief Define a dsp device heap manager run on the host. 
+* +* @version 1.00.00 +* +******************************************************************************/ +#ifndef _DSPHEAP_H +#define _DSPHEAP_H +#include <map> +#include <assert.h> +#include <cstdio> +#include <cstdlib> +#include "u_lockable.h" +#include "dspmem.h" + +#define ROUNDUP(val, pow2) (((val) + (pow2) - 1) & ~((pow2) - 1)) +#define MIN_BLOCK_SIZE 128 +#define MIN_CMEM_ONDEMAND_BLOCK_SIZE 4096 + +class dspheap : public Lockable +{ + typedef std::map<DSPDevicePtr64, uint64_t> block_list; + typedef block_list::iterator block_iter; + typedef block_list::value_type block_descriptor; + + public: + dspheap(DSPDevicePtr64 start_addr, uint64_t length) + { + configure(start_addr, length); + } + + dspheap() { } + + void configure(DSPDevicePtr64 start_addr, uint64_t length, + bool is_cmem_ondemand_heap = false) + { + /*--------------------------------------------------------------------- + * Ensure that the start_addr and length are multiples of 16M. + * 16M is the granularity of a memory region that can be controlled + * by a MAR register of C6x. + *--------------------------------------------------------------------*/ + //assert((length & 0xFFFFFF) == 0); + //assert(((uint32_t)start_addr & 0xFFFFFF) == 0); + + p_start_addr = start_addr; + p_length = length; + p_block_size = is_cmem_ondemand_heap ? 
MIN_CMEM_ONDEMAND_BLOCK_SIZE + : MIN_BLOCK_SIZE; + + Lock lock(this); + if (free_list.empty()) + free_list[start_addr] = length; + } + + ~dspheap() { } + + DSPDevicePtr64 malloc(uint32_t size, bool allow_fail=false) + { + size = min_block_size(size); + + Lock lock(this); + for (block_iter it = free_list.begin(); it != free_list.end(); ++it) + { + DSPDevicePtr64 block_addr = (*it).first; + uint64_t block_size = (*it).second; + + if (block_size >= size) + { + free_list.erase(it); + alloc_list[block_addr] = size; + + /*------------------------------------------------------------- + * if we only use a portion of the free block + *------------------------------------------------------------*/ + if (block_size > size) + free_list[(DSPDevicePtr64)block_addr+size] = block_size-size; + + return block_addr; + } + } + + if (!allow_fail) + { + printf("Malloc failed for size 0x%x from range (0x%08llx, 0x%08llx)\n", + size, p_start_addr, p_start_addr+p_length-1); + abort(); + } + + return 0; + } + + int free(DSPDevicePtr64 addr) + { + /*--------------------------------------------------------------------- + * Nothing to do if not an allocated address + *--------------------------------------------------------------------*/ + Lock lock(this); + block_iter it = alloc_list.find(addr); + if (it == alloc_list.end()) return -1; + + uint32_t size = (*it).second; + alloc_list.erase(it); + + /*--------------------------------------------------------------------- + * Merge the block with neighboring free blocks + *--------------------------------------------------------------------*/ + it = free_list.begin(); + while (it != free_list.end()) + { + DSPDevicePtr64 block_addr = (*it).first; + uint64_t block_size = (*it).second; + + if ( block_addr + block_size == addr + || addr + size == block_addr) + { + block_iter merge_it = it; + if (block_addr < addr) addr = block_addr; + size = block_size + size; + ++it; + free_list.erase(merge_it); + continue; + } + ++it; + } + free_list[addr] = size; + 
return 0; + } + + DSPDevicePtr64 size() const { return p_length; } + + DSPDevicePtr64 max_block_size(uint64_t &size, uint32_t &block_size) + { + if (p_length < p_block_size) + { + block_size = p_block_size; + size = 0; + return 0; + } + + DSPDevicePtr64 max_block_addr = 0; + uint64_t max_block_size = p_block_size; + + Lock lock(this); + for (block_iter it = free_list.begin(); it != free_list.end(); ++it) + { + DSPDevicePtr64 block_addr = (*it).first; + uint64_t block_size = (*it).second; + + if (block_size >= max_block_size) + { + max_block_addr = block_addr; + max_block_size = block_size; + } + } + + block_size = p_block_size; + size = max_block_size; + return max_block_addr; + } + + private: + block_list free_list; + block_list alloc_list; + DSPDevicePtr64 p_start_addr; + uint64_t p_length; + uint32_t p_block_size; + + uint32_t min_block_size(uint32_t size) { return ROUNDUP(size, p_block_size); } +}; + +#endif // _DSPHEAP_H diff --git a/src/core/dsp/dspmem.h b/src/core/dsp/dspmem.h new file mode 100644 index 0000000..f6c7c64 --- /dev/null +++ b/src/core/dsp/dspmem.h @@ -0,0 +1,59 @@ +/****************************************************************************** + * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
#include <stdint.h>
#ifndef _DSPMEM_H
#define _DSPMEM_H


// Address types used for DSP memory: 32-bit and 64-bit wide variants.
typedef uint32_t DSPDevicePtr;
typedef uint64_t DSPDevicePtr64;
typedef uint32_t DSPVirtPtr;
// typedef uint64_t DSPVirtPtr64; // for future C7x?

/*****************************************************************************
 * DSP Device Memory Physical Address (8GB)
 * 0x8:0000_0000 - 0x8:1FFF_FFFF: Linux reserved
 * 0x8:2000_0000 - 0x8:21FF_FFFF: OCL runtime reserved
 *                                using default MPAX translation, map to
 *                                DSP virtual address 0xA000_0000 - 0xA1FF_FFFF
 * 0x8:2200_0000 - 0x8:3FFF_FFFF: using default MPAX translation, map to
 *                                DSP virtual address 0xA200_0000 - 0xBFFF_FFFF
 *                                used for kernel code, user app small buffers
 * 0x8:4000_0000 - 0x9:FFFF_FFFF: using custom MPAX translation settings, map
 *                                to unused DSP virtual address spaces
 *                                used for user app big buffers
 *****************************************************************************/
// Start of the 8GB physical range described above (0x8:0000_0000).
#define DSP_36BIT_ADDR 0x800000000ULL
// Start of the custom-MPAX ("user app big buffers") range (0x8:4000_0000).
#define MPAX_USER_MAPPED_DSP_ADDR 0x840000000ULL
// NOTE(review): 0x8:8000_0000 is not described in the map above; from the
// name it is presumably the upper limit for persistently mapped memory -
// confirm against the code that uses it.
#define ALL_PERSISTENT_MAX_DSP_ADDR 0x880000000ULL

// NOTE(review): presumably the bounds of the MSMC range reserved for OpenCL;
// whether the end address is inclusive is not visible here - confirm.
#define MSMC_OCL_START_ADDR 0x0C040000
#define MSMC_OCL_END_ADDR 0x0C500000


#endif // _DSPMEM_H
+ * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "genfile_cache.h" + +std::string genfile_cache::lookup(llvm::Module *module, std::string options) +{ + std::vector<std::vector<std::string> > result; + uint32_t hash = convert_mod2crc(module, options); + + std::string query("select value from programs where hash = " + + boost::lexical_cast<std::string>(hash)); + + result = p_database.query(query.c_str()); + + if (!result.empty()) + { + string &filename = result[0][0]; + + struct stat statbuf; + if (stat(filename.c_str(), &statbuf) == 0) + return filename; + /*----------------------------------------------------------------- + * if (the cached filename no longer exists, remove it from the DB + *----------------------------------------------------------------*/ + else + { + std::string q2("delete from programs where hash = " + + boost::lexical_cast<std::string>(hash)); + + p_database.query(q2.c_str()); + return std::string(); + } + } + else return std::string(); +} + +void genfile_cache::remember(const char *outfile, llvm::Module *module, + std::string options) +{ + uint32_t hash = convert_mod2crc(module, options); + std::string query("insert into programs(hash, value) values(" + + boost::lexical_cast<std::string>(hash) + + ", \"" + + string(outfile) + + "\");"); + + p_database.query(query.c_str()); +} + +uint32_t genfile_cache::convert_mod2crc(llvm::Module *module, + std::string options) +{ + string llvm_ir; + + llvm::raw_string_ostream ostream(llvm_ir); + llvm::WriteBitcodeToFile(module, ostream); + ostream.str(); + + llvm_ir += options; + + return get_crc(llvm_ir); +} + +uint32_t genfile_cache::get_crc(std::string& my_string) +{ + boost::crc_32_type result; + result.process_bytes(my_string.data(), my_string.length()); + return result.checksum(); +} diff --git a/src/core/dsp/genfile_cache.h b/src/core/dsp/genfile_cache.h new file mode 100644 index 0000000..46b27f2 --- /dev/null +++ b/src/core/dsp/genfile_cache.h @@ -0,0 
+1,101 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#ifndef _genfile_cache_ +#define _genfile_cache_ + +#include <llvm/Support/raw_ostream.h> +#include <llvm/IR/Module.h> +#include <llvm/Bitcode/ReaderWriter.h> + +#include <boost/lexical_cast.hpp> +#include <boost/crc.hpp> + +#include <sys/stat.h> + +#include <string> +#include <iostream> +#include <sstream> +#include <vector> +#include <stdint.h> +#include "u_locks_pthread.h" +#include "database.h" + +class genfile_cache +{ + public: + std::string lookup (llvm::Module *module, std::string options); + void remember (const char *outfile, llvm::Module *module, + std::string options); + + /*------------------------------------------------------------------------- + * Thread safe instance function for singleton behavior + *------------------------------------------------------------------------*/ + static genfile_cache* instance () + { + static Mutex Cache_instance_mutex; + genfile_cache* tmp = pInstance; + + __sync_synchronize(); + + if (tmp == 0) + { + ScopedLock lck(Cache_instance_mutex); + + tmp = pInstance; + if (tmp == 0) + { + char *user = getenv("USER"); + tmp = new genfile_cache("/tmp/opencl_ofdb_" + string(user)); + __sync_synchronize(); + pInstance = tmp; + } + } + return tmp; + } + + + private: + static genfile_cache* pInstance; + std::string p_dbname; + Database p_database; + + private: + genfile_cache(std::string db_name) : p_dbname(db_name), p_database(db_name.c_str()) + { + p_database.query("create table if not exists " + "programs(hash integer, value string);"); + } + + uint32_t convert_mod2crc (llvm::Module *module, std::string options); + uint32_t get_crc (std::string& my_string); + + genfile_cache(const genfile_cache&); // copy ctor disallowed + genfile_cache& operator=(const genfile_cache&); // assignment disallowed +}; + +#endif // _genfile_cache_ diff --git a/src/core/dsp/kernel.cpp b/src/core/dsp/kernel.cpp new file mode 100644 index 0000000..291673a --- /dev/null 
+++ b/src/core/dsp/kernel.cpp @@ -0,0 +1,718 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "kernel.h" +#include "device.h" +#include "buffer.h" +#include "program.h" +#include "utils.h" +#include "u_locks_pthread.h" +#include "mailbox.h" + +#include "../kernel.h" +#include "../memobject.h" +#include "../events.h" +#include "../program.h" + +#include <llvm/IR/Function.h> +#include <llvm/IR/Constants.h> +#include <llvm/IR/Instructions.h> +#include <llvm/IR/LLVMContext.h> +#include <llvm/IR/Module.h> +#include <llvm/ExecutionEngine/ExecutionEngine.h> + +#include <cstdlib> +#include <cstring> +#include <iostream> +#include <string> +#include <vector> +#include <unistd.h> +#include <sys/mman.h> + +extern "C" +{ + #include <ti/runtime/mmap/include/mmap_resource.h> +} + + +#define ROUNDUP(val, pow2) (((val) + (pow2) - 1) & ~((pow2) - 1)) +#define QERR(msg, retcode) do {if (getenv("TI_OCL_VERBOSE_ERROR")) std::cerr << msg << std::endl; return retcode; } while(0) +#define ERR(x) std::cerr << x << std::endl +#define ERROR() std::cerr << "Unknown error in dsp/kernel.cpp" << std::endl + +using namespace Coal; + +DSPKernel::DSPKernel(DSPDevice *device, Kernel *kernel) +: DeviceKernel(), p_device(device), p_kernel(kernel), + p_device_entry_pt((DSPDevicePtr)0), + p_data_page_ptr ((DSPDevicePtr)0xffffffff) +{ +} + +DSPKernel::~DSPKernel() +{ +} + + +template<typename T> +T k_exp(T base, unsigned int e) +{ + T rs = base; + for (unsigned int i=1; i<e; ++i) rs *= base; + return rs; +} + +/*----------------------------------------------------------------------------- +* This and the next function are called from the multiple worker threads. They +* may all enter the set the name section, but they will all set the same value, +* so even though there is a race, there is no race error. when work group +* division is pushed down to the dsp, the race will go away. 
+*----------------------------------------------------------------------------*/ +DSPDevicePtr DSPKernel::device_entry_pt() +{ + if (!p_device_entry_pt) + { + size_t name_length; + p_kernel->info(CL_KERNEL_FUNCTION_NAME, 0, 0, &name_length); + + void *name = malloc(name_length); + p_kernel->info(CL_KERNEL_FUNCTION_NAME, name_length, name, 0); + + Program *p = (Program *)p_kernel->parent(); + DSPProgram *prog = (DSPProgram *)(p->deviceDependentProgram(p_device)); + + if (!prog->is_loaded()) ERROR(); + p_device_entry_pt = prog->query_symbol((char*)name); + free (name); + } + return p_device_entry_pt; +} + +/****************************************************************************** +* The data page pointer can frequently be 0, so we will initialize it to be +* 0xffffffff as a start value instead of 0. +******************************************************************************/ +DSPDevicePtr DSPKernel::data_page_ptr() +{ + if (p_data_page_ptr == (DSPDevicePtr)0xffffffff) + { + Program *p = (Program *)p_kernel->parent(); + DSPProgram *prog = (DSPProgram *)(p->deviceDependentProgram(p_device)); + + if (!prog->is_loaded()) ERROR(); + //p_data_page_ptr = prog->query_symbol("__TI_STATIC_BASE"); + p_data_page_ptr = prog->data_page_ptr(); + } + return p_data_page_ptr; +} + +/****************************************************************************** +* void DSPKernel::preAllocBuffers() +******************************************************************************/ +cl_int DSPKernel::preAllocBuffers() +{ + for (unsigned int i=0; i < kernel()->numArgs(); ++i) + { + const Kernel::Arg &arg = kernel()->arg(i); + + if (arg.kind() == Kernel::Arg::Buffer && + arg.file() != Kernel::Arg::Local) + { + MemObject *buffer = *(MemObject **)arg.data(); + if (buffer && !buffer->allocate(device())) + return CL_MEM_OBJECT_ALLOCATION_FAILURE; + } + } + return CL_SUCCESS; +} + + +/****************************************************************************** +* Try to find the size a 
work group needs to be executed the fastest on the DSP. +******************************************************************************/ +size_t DSPKernel::guessWorkGroupSize(cl_uint num_dims, cl_uint dim, + size_t global_work_size) const +{ + // ASW TODO - what the ???? + unsigned int dsps = p_device->numDSPs(); + + /*------------------------------------------------------------------------- + * Don't break in too small parts + *------------------------------------------------------------------------*/ + if (k_exp(global_work_size, num_dims) > 64) + return global_work_size; + + /*------------------------------------------------------------------------- + * Find the divisor of global_work_size the closest to dsps but >= than it + *------------------------------------------------------------------------*/ + unsigned int divisor = dsps <= 0 ? 1 : dsps; + + while (true) + { + if ((global_work_size % divisor) == 0) + break; + + /*--------------------------------------------------------------------- + * Don't let the loop go up to global_work_size, the overhead would be + * too huge + *--------------------------------------------------------------------*/ + if (divisor > global_work_size || divisor > dsps * 32) + { + divisor = 1; // Not parallel but has no CommandQueue overhead + break; + } + + divisor -= 1; + } + + /*------------------------------------------------------------------------- + * Return the size + *------------------------------------------------------------------------*/ + return global_work_size / divisor; +} + +/****************************************************************************** +* localMemSize() +******************************************************************************/ +cl_ulong DSPKernel::localMemSize() const +{ + cl_ulong local_mem = 0; + + for (int i = 0; i < kernel()->numArgs(); ++i) + { + const Kernel::Arg &arg = kernel()->arg(i); + + if (arg.kind() == Kernel::Arg::Buffer && + arg.file() == Kernel::Arg::Local) + local_mem += 
arg.allocAtKernelRuntime(); + } + + return local_mem; +} + +Kernel * DSPKernel::kernel() const { return p_kernel; } +DSPDevice * DSPKernel::device() const { return p_device; } + +// From Wikipedia : http://www.wikipedia.org/wiki/Power_of_two#Algorithm_to_round_up_to_power_of_two +template <class T> +T next_power_of_two(T k) +{ + if (k == 0) return 1; + + k--; + for (int i=1; i<sizeof(T)*8; i<<=1) + k = k | k >> i; + return k+1; +} + +size_t DSPKernel::typeOffset(size_t &offset, size_t type_len) +{ + size_t rs = offset; + + // Align offset to stype_len + type_len = next_power_of_two(type_len); + if (type_len > 8) type_len = 8; // The c66 has no alignment need > 8 bytes + + size_t mask = ~(type_len - 1); + + while (rs & mask != rs) + rs++; + + // Where to try to place the next value + offset = rs + type_len; + + return rs; +} + +static int kernelID = 0; + +/*============================================================================= +* DSPKernelEvent +*============================================================================*/ +DSPKernelEvent::DSPKernelEvent(DSPDevice *device, KernelEvent *event) +: p_device(device), p_event(event), p_kernel((DSPKernel*)event->deviceKernel()), + p_kernel_id(kernelID++), p_debug_kernel(false), p_num_arg_words(0), + p_WG_alloca_start(0) +{ + char *dbg = getenv("TI_OCL_DEBUG_KERNEL"); + if (dbg) p_debug_kernel = true; + + callArgs(MAX_ARG_BUF_SIZE); +} + +DSPKernelEvent::~DSPKernelEvent() { } + +#define READ_ONLY_BUFFER(buffer) (buffer->flags() & CL_MEM_READ_ONLY) +#define WRITE_ONLY_BUFFER(buffer) (buffer->flags() & CL_MEM_WRITE_ONLY) + +#define SETARG(val) if (arg_words < args_in_mem_size) args_in_mem[arg_words++] = val; \ + else std::cerr << "To many argument bytes are needed" << std::endl + +#define SETMOREARG(sz, pval) do \ + { \ + more_arg_offset = ROUNDUP(more_arg_offset, sz); \ + if (ROUNDUP(more_arg_offset + sz, 8) > sizeof(p_msg.u.k.flush.buffers))\ + std::cerr << "Too many arguments, does not fit" << std::endl; \ + 
//#define SETMOREARG(sz,psrc)

/******************************************************************************
* DSPKernelEvent::callArgs
*
* Marshals the kernel arguments into the message that run() later sends.
*
*   - Arguments 0..9 go into p_msg.u.k.kernel.argBuf as a size word followed
*     by one or two value words, ended by a 0 terminator word.
*   - Arguments 10 and up are packed, naturally aligned, into
*     p_msg.u.k.flush.buffers starting at byte offset 4.
*
* Buffer arguments whose final DSP address is not known yet (local buffers,
* CL_MEM_USE_HOST_PTR buffers, buffers at addresses not below 0xFFFFFFFF) get
* 0 written for now; the address of their argument slot is recorded in
* p_local_bufs / p_hostptr_tmpbufs / p_64bit_bufs so run() can patch it.
*
* args_in_mem_size is the capacity of argBuf in words (see SETARG).
*
* NOTE(review): size problems are only printed; marshalling continues.
* NOTE(review): for overflowing "more" arguments the recorded slot address may
* lie outside flush.buffers; nothing here stops run() from writing to it.
******************************************************************************/
void DSPKernelEvent::callArgs(unsigned args_in_mem_size)
{
    int       arg_words = 0;
    unsigned *args_in_mem = (unsigned*)p_msg.u.k.kernel.argBuf;
    char     *more_args_in_mem = (char *)p_msg.u.k.flush.buffers;
    int       more_arg_offset = 4;   // bytes 0..3 of the area are skipped
    bool      is_more_arg = false;

    /*-------------------------------------------------------------------------
    * Write Arguments
    *------------------------------------------------------------------------*/
    for (int i = 0;  i < p_kernel->kernel()->numArgs(); ++i)
    {
        is_more_arg = (i >= 10);

        const Kernel::Arg & arg  = p_kernel->kernel()->arg(i);
        size_t              size = arg.valueSize() * arg.vecDim();

        if (size == 0) ERR("Kernel Argument has size == 0");
        if (size != 1 && size != 2 && size != 4 && size != 8)
            ERR("Invalid Kernel Argument size");

        /*---------------------------------------------------------------------
        * We may have to perform some changes in the values (buffers, etc)
        *--------------------------------------------------------------------*/
        switch (arg.kind())
        {
            case Kernel::Arg::Buffer:
            {
                MemObject    *buffer = 0;
                DSPDevicePtr buf_ptr = 0;
                if (arg.data()) buffer = *(MemObject **)arg.data();
                if (!is_more_arg) SETARG(sizeof(DSPVirtPtr));

                // Slot that the SETARG/SETMOREARG at the end of this case is
                // about to fill; kept so run() can overwrite it later.
                DSPVirtPtr *buf_dspvirtptr = (!is_more_arg) ?
                    (&args_in_mem[arg_words]) :
                    (DSPVirtPtr *)(more_args_in_mem+ROUNDUP(more_arg_offset,4));

                /*-------------------------------------------------------------
                * Alloc a buffer and pass it to the kernel
                *------------------------------------------------------------*/
                if (arg.file() == Kernel::Arg::Local)
                {
                    uint32_t     lbufsz = arg.allocAtKernelRuntime();
                    p_local_bufs.push_back(LocalPair(buf_dspvirtptr, lbufsz));

                    /*-----------------------------------------------------
                    * Since the only reader and writer of local memory (L2)
                    * will be the core itself, I do not believe we need
                    * to flush local buffers for correctness.
                    *----------------------------------------------------*/
                    //p_flush_bufs->push_back(DSPMemRange(lbuf, lbufsz));
                }
                else if (buffer != NULL)
                {
                    /*---------------------------------------------------------
                    * Get the DSP buffer, allocate it and get its pointer
                    *--------------------------------------------------------*/
                    if (buffer->flags() & CL_MEM_USE_HOST_PTR)
                    {
                        // Address 0 for now; run() allocates the temporary.
                        p_hostptr_tmpbufs.push_back(
                           HostptrPair(buffer, DSPPtrPair(0, buf_dspvirtptr)));
                    }
                    else
                    {
                        DSPBuffer *dspbuf = (DSPBuffer *)buffer->deviceBuffer(p_device);
                        buffer->allocate(p_device);
                        DSPDevicePtr64 addr64 = dspbuf->data();
                        if (addr64 < 0xFFFFFFFF)
                            buf_ptr = addr64;
                        else
                            p_64bit_bufs.push_back(DSPMemRange(DSPPtrPair(
                                addr64, buf_dspvirtptr), buffer->size()));

                        if (! WRITE_ONLY_BUFFER(buffer))
                            p_flush_bufs.push_back(DSPMemRange(DSPPtrPair(
                                addr64, buf_dspvirtptr), buffer->size()));
                    }
                }

                /*---------------------------------------------------------
                * Use 0 for local buffer address here, it will be overwritten
                * with allocated local buffer address at kernel dispatch time.
                * Same for allocating temporary buffer for use_host_ptr.
                *--------------------------------------------------------*/
                if (!is_more_arg) SETARG(buf_ptr);
                else { SETMOREARG(4, &buf_ptr); }

                break;
            }

            case Kernel::Arg::Image2D:
            case Kernel::Arg::Image3D: ERR("Images not yet supported"); break;

            /*-----------------------------------------------------------------
            * Non-Buffers
            *----------------------------------------------------------------*/
            default:
                if (!is_more_arg)
                {
                    // Size word first (at least 4), then one value word, plus
                    // a second value word for 8-byte arguments.
                    SETARG((size < 4 ? 4 : size));
                    // Cast to (int) to avoid a codegen bug
                    // ZEXT will happen in LLVM and ICODE, so don't worry
                    if      (size == 1) SETARG(((int) *((signed char*)arg.data())));
                    else if (size == 2) SETARG(((int) *((short*)arg.data())));
                    else                SETARG(*((unsigned*) arg.data()));
                    if (size == 8) { SETARG(*(((unsigned*)arg.data()) + 1)); }
                }
                else { SETMOREARG(size, arg.data()); }
                break;
        }
    }
    SETARG(0);  // 0 terminator for args area

    p_num_arg_words = arg_words;
    // 0 when no "more" argument was written, else the used size rounded to 8.
    p_msg.u.k.flush.sizeMoreArgs = (more_arg_offset > 4) ?
                                   ROUNDUP(more_arg_offset, 8) : 0;
}

/******************************************************************************
* debug_pause
*
* Prints what is about to be launched and where to set a breakpoint, then
* blocks until one non-whitespace character can be read from std::cin.
******************************************************************************/
static void debug_pause(uint32_t entry, uint32_t dsp_id,
                        const char* outfile, char *name)
{
    printf("[OCL] Launching kernel %s on DSP %d\n", name, dsp_id);
    printf("[OCL] Connect debugger and set breakpoint at 0x%08x\n", entry);
    printf("[OCL] Load symbols from file %s\n", outfile);
    printf("[OCL] Press any key, then enter to continue\n");
    do { char t; std::cin >> t; } while(0);
}
perhaps ensure that prog is loaded. + + int dim = p_event->work_dim(); + + /*------------------------------------------------------------------------- + * Create a message for the DSP + *------------------------------------------------------------------------*/ + Msg_t &msg = p_msg; + kernel_config_t *cfg = &msg.u.k.kernel.config; + + if (evtype == Event::TaskKernel) + { + msg.command = TASK; + cfg->Kernel_id = p_kernel_id; + + CommandQueue *q = (CommandQueue *) p_event->parent(); + cl_command_queue_properties q_prop = 0; + q->info(CL_QUEUE_PROPERTIES, sizeof(q_prop), &q_prop, NULL); + cfg->global_sz_0 = (q_prop & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) ? + OUT_OF_ORDER_TASK_SIZE : IN_ORDER_TASK_SIZE; + cfg->local_sz_0 = 1; + cfg->local_sz_1 = 1; + cfg->local_sz_2 = 1; + } + else + { + msg.command = NDRKERNEL; + + cfg->num_dims = dim; + cfg->global_sz_0 = p_event->global_work_size(0); + cfg->global_sz_1 = dim > 1 ? p_event->global_work_size(1) : 1; + cfg->global_sz_2 = dim > 2 ? p_event->global_work_size(2) : 1; + cfg->local_sz_0 = p_event->local_work_size(0); + cfg->local_sz_1 = dim > 1 ? p_event->local_work_size(1) : 1; + cfg->local_sz_2 = dim > 2 ? 
p_event->local_work_size(2) : 1; + cfg->global_off_0 = p_event->global_work_offset(0); + cfg->global_off_1 = p_event->global_work_offset(1); + cfg->global_off_2 = p_event->global_work_offset(2); + cfg->WG_gid_start_0 = 0; + cfg->WG_gid_start_1 = 0; + cfg->WG_gid_start_2 = 0; + cfg->Kernel_id = p_kernel_id; + cfg->WG_id = 0; + cfg->stats = 0; + } + + msg.u.k.kernel.entry_point = (unsigned)p_kernel->device_entry_pt(); + msg.u.k.kernel.data_page_ptr = (unsigned)p_kernel->data_page_ptr(); + + /*------------------------------------------------------------------------- + * Allocating local buffer in L2 per kernel run instance + *------------------------------------------------------------------------*/ + uint32_t total_sz, block_sz; + DSPDevicePtr local_scratch = p_device->get_local_scratch(total_sz, block_sz); + for (size_t i = 0; i < p_local_bufs.size(); ++i) + { + DSPVirtPtr *p_arg_word = p_local_bufs[i].first; + unsigned local_buf_size = p_local_bufs[i].second; + + uint32_t rounded_sz = ROUNDUP(local_buf_size, block_sz); + if (rounded_sz > total_sz) + { + QERR("Total local buffer size exceeds available local size", + CL_MEM_OBJECT_ALLOCATION_FAILURE); + } + *p_arg_word = local_scratch; + local_scratch += rounded_sz; + total_sz -= rounded_sz; + } + + /*------------------------------------------------------------------------- + * Allocating temporary space in global memory for kernel alloca'ed data + *------------------------------------------------------------------------*/ +#define NUM_CORES_PER_CHIP 8 + cfg->WG_alloca_size = p_kernel->kernel()->get_wi_alloca_size() * + cfg->local_sz_0 * cfg->local_sz_1 * cfg->local_sz_2; + if (cfg->WG_alloca_size > 0) + { + cfg->WG_alloca_size += 4096; // 4K bytes padding between WGs' allocas + uint32_t chip_alloca_size = cfg->WG_alloca_size * NUM_CORES_PER_CHIP; + p_WG_alloca_start = p_device->malloc_global( // malloc abort if fail + chip_alloca_size, true); + if (!p_WG_alloca_start) + { + QERR("Alloca size exceeds available global 
memory", + CL_OUT_OF_RESOURCES); + } + + if (p_WG_alloca_start < 0xFFFFFFFF) + cfg->WG_alloca_start = (DSPVirtPtr) p_WG_alloca_start; + else + p_64bit_bufs.push_back(DSPMemRange(DSPPtrPair( + p_WG_alloca_start, &cfg->WG_alloca_start), chip_alloca_size)); + } + + /*------------------------------------------------------------------------- + * Allocating temporary global buffer for use_host_ptr + *------------------------------------------------------------------------*/ + for (int i = 0; i < p_hostptr_tmpbufs.size(); ++i) + { + MemObject *buffer = p_hostptr_tmpbufs[i].first; + DSPDevicePtr64 *p_addr64 = &p_hostptr_tmpbufs[i].second.first; + DSPVirtPtr *p_arg_word = p_hostptr_tmpbufs[i].second.second; + + *p_addr64 = p_device->malloc_global(buffer->size(), false); + + if (!p_addr64) + { + QERR("Temporary memory for CL_MEM_USE_HOST_PTR buffer exceeds available global memory", + CL_MEM_OBJECT_ALLOCATION_FAILURE); + } + + if (*p_addr64 < 0xFFFFFFFF) + *p_arg_word = *p_addr64; + else + p_64bit_bufs.push_back(DSPMemRange(DSPPtrPair( + *p_addr64, p_arg_word), buffer->size())); + + if (! 
WRITE_ONLY_BUFFER(buffer)) + { + void *mapped_tmpbuf = Driver::instance()->map(*p_addr64, + buffer->size(), false); + memcpy(mapped_tmpbuf, buffer->host_ptr(), buffer->size()); + p_flush_bufs.push_back(DSPMemRange(DSPPtrPair( + *p_addr64, p_arg_word), buffer->size())); + Driver::instance()->unmap(mapped_tmpbuf, *p_addr64, + buffer->size(), true); + } + } + + /*------------------------------------------------------------------------- + * Compute MPAX mappings from DSPDevicePtr64 to DSPVirtPtr in p_64bit_bufs + *------------------------------------------------------------------------*/ + msg.u.k.flush.num_mpaxs = 0; + uint32_t num_64bit_bufs = p_64bit_bufs.size(); + if (num_64bit_bufs > 0) + { + uint64_t *phys_addrs = new uint64_t[num_64bit_bufs]; + uint32_t *lengths = new uint32_t[num_64bit_bufs]; + uint32_t *prots = new uint32_t[num_64bit_bufs]; + uint32_t *virt_addrs = new uint32_t[num_64bit_bufs]; + for (int i = 0; i < p_64bit_bufs.size(); ++i) + { + phys_addrs[i] = p_64bit_bufs[i].first.first; + lengths[i] = p_64bit_bufs[i].second; + prots[i] = 0; // don't care yet + } + + keystone_mmap_resources_t mpax_res; + memcpy(&mpax_res, p_device->get_mpax_default_res(), + sizeof(keystone_mmap_resources_t)); + if (keystone_mmap_resource_alloc(num_64bit_bufs, phys_addrs, lengths, + prots, virt_addrs, &mpax_res) != KEYSTONE_MMAP_RESOURCE_NOERR) + { + QERR("MPAX allocation failed!", + CL_OUT_OF_RESOURCES); + } + + // set the MPAX settings in the message + uint32_t mpax_used = 0; + for (; mpax_res.mapping[mpax_used].segsize_power2 > 0; mpax_used += 1) + { + msg.u.k.flush.mpax_settings[2*mpax_used ] = (uint32_t) + (mpax_res.mapping[mpax_used].raddr >> 12); // e.g. 0x822004 + msg.u.k.flush.mpax_settings[2*mpax_used+1] = // e.g. 
0xC000000D + mpax_res.mapping[mpax_used].baddr + | (mpax_res.mapping[mpax_used].segsize_power2-1); + } + msg.u.k.flush.num_mpaxs = mpax_used; + + // set the virtual address in arguments + for (int i = 0; i < p_64bit_bufs.size(); ++i) + { + *(p_64bit_bufs[i].first.second) = virt_addrs[i]; + if (p_debug_kernel) + printf("Virtual = 0x%x, physical = 0x%llx\n", + virt_addrs[i], p_64bit_bufs[i].first.first); + } + delete [] phys_addrs; + delete [] lengths; + delete [] prots; + delete [] virt_addrs; + } + + /*------------------------------------------------------------------------- + * Helpful information for debugging a kernel + *------------------------------------------------------------------------*/ + if (p_debug_kernel) + { + for (int i = 0; i < msg.u.k.flush.num_mpaxs; i++) + printf("mpax %d: l=0x%x, h=0x%x\n", i, + msg.u.k.flush.mpax_settings[2*i], + msg.u.k.flush.mpax_settings[2*i+1]); + + uint32_t *args = msg.u.k.kernel.argBuf; + int arg_num = 1; + // TODO: print more args properly + for (int i=0; i < p_num_arg_words; i++) + { + if (args[i] == 4) + { + i++; + printf("[OCL] Kernel argument %d = 0x%08x\n", arg_num, args[i]); + } + else if (args[i] == 8) + { + printf("[OCL] Kernel argument %d = 0x%08x 0x%08x\n", + arg_num, args[i+1], args[i+2]); + i+=2; + } + arg_num++; + } + } + + /*------------------------------------------------------------------------- + * Make sure we do not overflow the number of commands a mailbox can handle + *------------------------------------------------------------------------*/ + if (p_flush_bufs.size() > MAX_KERNEL_ARGUMENTS) + { + QERR("To many buffers to flush", CL_OUT_OF_RESOURCES); + } + + /*------------------------------------------------------------------------- + * Populate Flush commands for any buffers that are read by the DSP + *------------------------------------------------------------------------*/ + msg.u.k.flush.numBuffers = p_flush_bufs.size(); + +#if 0 // YUAN: flush buffers used for more arguments (for now) + for 
(int i=0; i < p_flush_bufs.size(); ++i) + { + msg.u.k.flush.buffers[2*i] = p_flush_bufs[i].first; + msg.u.k.flush.buffers[2*i+1] = p_flush_bufs[i].second; + } +#endif + + /*------------------------------------------------------------------------- + * Feedback to user for debug + *------------------------------------------------------------------------*/ + if (p_debug_kernel) + { + size_t name_length; + p_kernel->kernel()->info(CL_KERNEL_FUNCTION_NAME, 0, 0, &name_length); + char *name = (char*)malloc(name_length); + if (!name) return CL_OUT_OF_HOST_MEMORY; + p_kernel->kernel()->info(CL_KERNEL_FUNCTION_NAME, name_length, name, 0); + + debug_pause(p_kernel->device_entry_pt(), p_device->dspID(), + prog->outfile_name(), name); + free (name); + } + + /*------------------------------------------------------------------------- + * Dispatch the commands through the mailbox + *------------------------------------------------------------------------*/ + p_device->mail_to(msg); + + /*------------------------------------------------------------------------- + * Do not wait for completion + *------------------------------------------------------------------------*/ + return CL_SUCCESS; +} + +/****************************************************************************** +* free_tmp_bufs allocated for kernel allocas, and for use_host_ptr +******************************************************************************/ +void DSPKernelEvent::free_tmp_bufs() +{ + if (p_WG_alloca_start > 0) + p_device->free_global(p_WG_alloca_start); + + for (int i = 0; i < p_hostptr_tmpbufs.size(); ++i) + { + MemObject *buffer = p_hostptr_tmpbufs[i].first; + DSPDevicePtr64 addr64 = p_hostptr_tmpbufs[i].second.first; + + if (! 
READ_ONLY_BUFFER(buffer)) + { + void *mapped_tmpbuf = Driver::instance()->map(addr64, + buffer->size(), true); + memcpy(buffer->host_ptr(), mapped_tmpbuf, buffer->size()); + Driver::instance()->unmap(mapped_tmpbuf, addr64, + buffer->size(), false); + } + p_device->free_global(addr64); + } + +} + diff --git a/src/core/dsp/kernel.h b/src/core/dsp/kernel.h new file mode 100644 index 0000000..850941d --- /dev/null +++ b/src/core/dsp/kernel.h @@ -0,0 +1,119 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
/******************************************************************************
 * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions are met:
 *       * Redistributions of source code must retain the above copyright
 *         notice, this list of conditions and the following disclaimer.
 *       * Redistributions in binary form must reproduce the above copyright
 *         notice, this list of conditions and the following disclaimer in the
 *         documentation and/or other materials provided with the distribution.
 *       * Neither the name of Texas Instruments Incorporated nor the
 *         names of its contributors may be used to endorse or promote products
 *         derived from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 *   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 *   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 *   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 *   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 *   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 *   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 *   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 *   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 *   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 *   THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
#ifndef __DSP_KERNEL_H__
#define __DSP_KERNEL_H__

#include "../events.h"
#include "../memobject.h"
#include "../deviceinterface.h"
#include "message.h"
#include "device.h"
#include <core/config.h>

#include <vector>
#include <string>
#include <pthread.h>
#include <stdint.h>

namespace llvm
{
    class Function;
}

// (64-bit device address, address of the 32-bit argument slot that receives
// the address the DSP should use)
typedef std::pair<DSPDevicePtr64, DSPVirtPtr *>  DSPPtrPair;
// (address pair, size in bytes)
typedef std::pair<DSPPtrPair, uint32_t>          DSPMemRange;
// (argument slot of a local buffer, size in bytes)
typedef std::pair<DSPVirtPtr *, uint32_t>        LocalPair;
// (CL_MEM_USE_HOST_PTR buffer, address pair of its device-side temporary)
typedef std::pair<Coal::MemObject *, DSPPtrPair> HostptrPair;


namespace Coal
{
class DSPDevice;
class Kernel;
class KernelEvent;

/*-----------------------------------------------------------------------------
* Device-specific companion of a Coal::Kernel for one DSP device.  Caches the
* kernel's entry point and data page pointer and answers the work-group and
* memory-size queries of the DeviceKernel interface.
*----------------------------------------------------------------------------*/
class DSPKernel : public DeviceKernel
{
    public:
        DSPKernel(DSPDevice *device, Kernel *kernel);
        ~DSPKernel();

        size_t       workGroupSize()          const { return 128; }
        cl_ulong     localMemSize()           const ;
        cl_ulong     privateMemSize()         const { return 0; }
        size_t       preferredWorkGroupSizeMultiple() const { return 0; }

        size_t       guessWorkGroupSize(cl_uint num_dims, cl_uint dim,
                                        size_t global_work_size) const;
        DSPDevicePtr device_entry_pt();   // looked up on first use, then cached
        DSPDevicePtr data_page_ptr();     // looked up on first use, then cached
        cl_int       preAllocBuffers();

        Kernel *     kernel() const;
        DSPDevice *  device() const;

        llvm::Function *function() const;
        static size_t typeOffset(size_t &offset, size_t type_len);

    private:
        DSPDevice *    p_device;
        Kernel *       p_kernel;
        DSPDevicePtr   p_device_entry_pt;  // 0 until looked up
        DSPDevicePtr   p_data_page_ptr;    // 0xffffffff until looked up
};

/*-----------------------------------------------------------------------------
* One launch of a DSPKernel: owns the message sent to the DSP and the
* bookkeeping for buffers whose addresses are resolved at dispatch time.
*----------------------------------------------------------------------------*/
class DSPKernelEvent
{
    public:
        DSPKernelEvent  (DSPDevice *device, KernelEvent *event);
        ~DSPKernelEvent ();

        cl_int run      (Event::Type evtype);
        void   callArgs (unsigned rs_size);

        DSPDevice* device()    { return p_device; }
        uint32_t   kernel_id() { return p_kernel_id; }

        void free_tmp_bufs();

    private:
        DSPDevice *               p_device;
        KernelEvent *             p_event;
        DSPKernel *               p_kernel;
        uint32_t                  p_kernel_id;
        bool                      p_debug_kernel;     // set from TI_OCL_DEBUG_KERNEL
        int                       p_num_arg_words;    // words written to argBuf
        Msg_t                     p_msg;              // message mailed by run()
        DSPDevicePtr64            p_WG_alloca_start;  // 0 when nothing allocated
        std::vector<DSPMemRange>  p_flush_bufs;       // ranges the DSP reads
        std::vector<LocalPair>    p_local_bufs;       // local buffer arguments
        std::vector<HostptrPair>  p_hostptr_tmpbufs;  // use_host_ptr temporaries
        std::vector<DSPMemRange>  p_64bit_bufs;       // ranges needing MPAX mapping
};
}
#endif
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#ifndef _MAILBOX_H_ +#define _MAILBOX_H_ +#include "u_locks_pthread.h" +#include "driver.h" + +extern "C" +{ + #include "mpm_mailbox.h" +} + +class Mailbox +{ + public: + + int32_t create(void* mbox_handle, char *slave_node_name, + uint32_t mem_location, uint32_t direction, + mpm_mailbox_config_t *mbox_config) + { + int32_t result = mpm_mailbox_create(mbox_handle, slave_node_name, + mem_location, direction, mbox_config); + return result; + } + + int32_t open(void* mbox_handle) + { + int32_t result = mpm_mailbox_open(mbox_handle); + return result; + } + + int32_t write (void* mbox_handle, uint8_t *buf, uint32_t size, + uint32_t trans_id) + { + int result; + + do result = mpm_mailbox_write (mbox_handle, buf, size, trans_id); + while (result == MPM_MAILBOX_ERR_MAIL_BOX_FULL); + + return true; + } + + int32_t read (void* mbox_handle, uint8_t *buf, uint32_t *size, + uint32_t *trans_id) + { + int32_t result = mpm_mailbox_read (mbox_handle, buf, size, trans_id); + return result; + } + + int32_t query (void* mbox_handle) + { + int32_t result = mpm_mailbox_query (mbox_handle); + return result; + } + + 
/*------------------------------------------------------------------------- + * Thread safe instance function for singleton behavior + *------------------------------------------------------------------------*/ + static Mailbox* instance () + { + static Mutex Mailbox_instance_mutex; + Mailbox* tmp = pInstance; + + __sync_synchronize(); + + if (tmp == 0) + { + ScopedLock lck(Mailbox_instance_mutex); + + tmp = pInstance; + if (tmp == 0) + { + tmp = new Mailbox; + __sync_synchronize(); + pInstance = tmp; + } + } + return tmp; + } + + private: + static Mailbox* pInstance; + + Mailbox() { } // ctor private + Mailbox(const Mailbox&); // copy ctor disallowed + Mailbox& operator=(const Mailbox&); // assignment disallowed +}; + +#endif // _MAILBOX_H_ diff --git a/src/core/dsp/memmap.h b/src/core/dsp/memmap.h new file mode 100644 index 0000000..503540e --- /dev/null +++ b/src/core/dsp/memmap.h @@ -0,0 +1,120 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#define ERR(status, msg) if (status) { printf("ERROR: %s\n", msg); exit(-1); } + +class DSP_MappedMem +{ + public: + DSP_MappedMem(uint32_t dsp_id, uint32_t size) + : p_size(size), p_dsp_id(dsp_id), p_dsp_addr(0) + p_num_buffers(CEIL_DIVIDE(size, HOST_CMEM_BUFFER_SIZE)) + { + p_buffers = new [p_num_buffers] cmem_host_buf_desc_t; + ERR(!p_buffers, "Cannot allocate host memory for a DSP Mapped Region"); + + int status + for (int i = 0; i< num_buffers; i++) + { + status = bufmgrAlloc(DmaBufPool, 1, &p_buffers[i]); + ERR(status, "Cannot allocate CMEM pool for a DSP Mapped Region"); + } + + /*--------------------------------------------------------------------- + * Allocate DSP range + *--------------------------------------------------------------------*/ + status = pciedrv_dsp_memrange_alloc(dsp_id, size, p_dsp_addr); + ERR(status, "PCIe driver dsp memrange alloc failed"); + + /*--------------------------------------------------------------------- + * Map Input buffers to dsp range + *--------------------------------------------------------------------*/ + status = 
pciedrv_map_bufs_to_dsp_memrange(dsp_id, num_buffers, + p_buffers, (uint32_t) p_dsp_addr); + ERR(status, "PCIe driver dsp map bufs to memrange failed"); + } + + ~DSP_MappedMem() + { + /*--------------------------------------------------------------------- + * Free DSP range + *--------------------------------------------------------------------*/ + int status = pciedrv_dsp_memrange_free(dsp_id, size, p_dsp_addr); + ERR(status, "PCIe driver dsp memrange free failed"); + + for (int i = 0; i< num_buffers; i++) + { + status = bufmgrFreeDesc(DmaBufPool, &p_buffers[i]); + ERR(status, "Cannot free CMEM pool for a DSP Mapped Region"); + } + + delete [p_num_buffers] p_buffers; + } + + void copy_in(void* p, uint32_t size) + { + ERR(size > p_size, "DSP Mapped region input overflow"); + + uint32_t remaining_size = size; + uint32_t offset = 0; + + for (int i = 0; remaining_size; i++) + { + int chunk_size = std::min(remaining_size, p_buffers[i].length); + + memcpy(p_buffers[i].user_addr, p + offset, chunk_size); + + remaining_size -= chunk_size; + offset += chunk_size; + } + } + + void copy_out(void* p, uint32_t size) + { + ERR(size > p_size, "DSP Mapped region output underrflow"); + + uint32_t remaining_size = size; + uint32_t offset = 0; + + for (int i = 0; remaining_size; i++) + { + int chunk_size = std::min(remaining_size, p_buffers[i].length); + + memcpy(p + offset, p_buffers[i].user_addr, chunk_size); + + remaining_size -= chunk_size; + offset += chunk_size; + } + } + + private: + uint32_t p_size; + uint32_t p_dsp_id; + uint32_t p_dsp_addr; + uint32_t p_num_buffers; + cmem_host_buf_desc_t *p_buffers; +}; diff --git a/src/core/dsp/message.h b/src/core/dsp/message.h new file mode 100644 index 0000000..d93fe1e --- /dev/null +++ b/src/core/dsp/message.h @@ -0,0 +1,115 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
/******************************************************************************
* Command codes carried in Msg_t.command.
******************************************************************************/
typedef enum { READY, EXIT, TASK, NDRKERNEL, WORKGROUP, CACHEINV, FREQUENCY, SUCCESS, ERROR, PRINT } command_codes;

/******************************************************************************
* Sizing constants for the fixed-size message arrays below.  Every expression
* macro is fully parenthesized so it can be used inside larger expressions
* (the previous MAX_ARG_BUF_SIZE expanded to `(N*3)+1`, so `2*MAX_ARG_BUF_SIZE`
* evaluated to 61 instead of 62).
******************************************************************************/
#define MAX_KERNEL_ARGUMENTS 10
#define MAX_ARG_BUF_SIZE     ((MAX_KERNEL_ARGUMENTS*3)+1)
#define MAX_FLUSH_BUF_SIZE   (MAX_KERNEL_ARGUMENTS*2)

#define MAX_XMCSES_MPAXS     7
#define FIRST_FREE_XMC_MPAX  3  // XMC MPAXs available: 3 - F
#define FIRST_FREE_SES_MPAX  1  // SES MPAXs available: 1 - 7

/******************************************************************************
* Need to ensure that the alignments and therefore the offsets of all fields
* are consistent between the host and the device.
******************************************************************************/
typedef struct
{
    uint32_t num_dims;

    uint32_t global_sz_0;
    uint32_t global_sz_1;
    uint32_t global_sz_2;
    uint32_t local_sz_0;
    uint32_t local_sz_1;
    uint32_t local_sz_2;
    uint32_t global_off_0;
    uint32_t global_off_1;
    uint32_t global_off_2;
    uint32_t WG_gid_start_0;
    uint32_t WG_gid_start_1;
    uint32_t WG_gid_start_2;
    uint32_t Kernel_id;
    uint32_t WG_id;
    uint32_t stats;
    uint32_t WG_alloca_start;
    uint32_t WG_alloca_size;
} kernel_config_t;

typedef struct
{
    uint8_t  numBuffers;
    uint8_t  num_mpaxs;   // TODO: XMC only mpax for kernel alloca memory
    uint16_t sizeMoreArgs;
    uint32_t buffers[MAX_FLUSH_BUF_SIZE];
    uint32_t mpax_settings[2*MAX_XMCSES_MPAXS];  // (MPAXL, MPAXH) pair
} flush_msg_t;

typedef struct
{
    kernel_config_t config;
    uint32_t        entry_point;
    uint32_t        data_page_ptr;
    uint32_t        argBuf[MAX_ARG_BUF_SIZE];    // NULL size terminated
} kernel_msg_t;

/******************************************************************************
* Msg_t: a command code followed by a payload that is either a kernel launch
* description plus flush information, or raw bytes of the same total size.
******************************************************************************/
typedef struct
{
    command_codes command;
    union
    {
        struct
        {
            kernel_msg_t kernel;
            flush_msg_t  flush;
        } k;
        char message[sizeof(kernel_msg_t) + sizeof(flush_msg_t)];
    } u;
} Msg_t;

/* Pre-built messages that carry only a command code. */
static Msg_t exitMsg      = {EXIT};
static Msg_t successMsg   = {SUCCESS};
static Msg_t readyMsg     = {READY};
static Msg_t errorMsg     = {ERROR};
static Msg_t frequencyMsg = {FREQUENCY};
// static far Msg_t printMsg = {PRINT}; // moved to L2 in monitor

/* Size in bytes of one mailbox payload: exactly one Msg_t. */
static const uint32_t mbox_payload = sizeof(Msg_t);

#define MBOX_SIZE 0x2000

#define IN_ORDER_TASK_SIZE     1
#define OUT_OF_ORDER_TASK_SIZE (IN_ORDER_TASK_SIZE+1)
/*****************************************************************************/
/* DLDYN_c60_process_dynamic_tag()                                           */
/*                                                                           */
/*    Process C6x specific dynamic tags.  Looks at entry i of the module's   */
/*    dynamic table and, for the C6x tags handled below, records the tag's   */
/*    value (or its index) in dyn_module.  Returns TRUE when the tag was     */
/*    recognized and consumed, FALSE for any other tag.                      */
/*****************************************************************************/
BOOL DLDYN_c60_process_dynamic_tag(DLIMP_Dynamic_Module* dyn_module, int i)
{
   switch (dyn_module->dyntab[i].d_tag)
   {
      /*------------------------------------------------------------------*/
      /* DT_C6000_GSYM_OFFSET: Dynamic symbol table is partitioned into   */
      /*                       local and global symbols. This tag has the */
      /*                       offset into the dynamic symbol table where */
      /*                       the global symbol table starts.            */
      /*------------------------------------------------------------------*/
      case DT_C6000_GSYM_OFFSET:
         dyn_module->gsymtab_offset = dyn_module->dyntab[i].d_un.d_val;
#if LOADER_DEBUG
         if (debugging_on)
            DLIF_trace("Found global symbol table: %d\n",
                       dyn_module->gsymtab_offset);
#endif
         return TRUE;

      /*------------------------------------------------------------------*/
      /* DT_C6000_GSTR_OFFSET: Contains the offset into the dynamic       */
      /*                       string table where the global symbol names */
      /*                       start.                                     */
      /*------------------------------------------------------------------*/
      case DT_C6000_GSTR_OFFSET:
         dyn_module->gstrtab_offset = dyn_module->dyntab[i].d_un.d_val;
#if LOADER_DEBUG
         if (debugging_on)
            DLIF_trace("Found global string table: %d\n",
                       dyn_module->gstrtab_offset);
#endif
         return TRUE;

      /*------------------------------------------------------------------*/
      /* DT_C6000_DSBT_BASE: Contains address of DSBT in executable or    */
      /*                     shared object.                               */
      /*                     We store the tag's location in the dynamic   */
      /*                     module object so that we can update it       */
      /*                     easily after the sections have been          */
      /*                     allocated (tag value is relocated).          */
      /*------------------------------------------------------------------*/
      case DT_C6000_DSBT_BASE:
         dyn_module->dsbt_base_tagidx = i;
         return TRUE;

      /*------------------------------------------------------------------*/
      /* DT_C6000_DSBT_INDEX: Contains specific request for a DSBT        */
      /*                      index. If this object module doesn't get    */
      /*                      the index it requested, then the load will  */
      /*                      fail (object module has already assumed     */
      /*                      that it got the DSBT index it asks for;     */
      /*                      references to the DSBT index will not have  */
      /*                      relocation entries associated with them).   */
      /*------------------------------------------------------------------*/
      case DT_C6000_DSBT_INDEX:
         dyn_module->dsbt_index = dyn_module->dyntab[i].d_un.d_val;
         return TRUE;

      /*------------------------------------------------------------------*/
      /* DT_C6000_DSBT_SIZE: Contains the size of the DSBT allocated for  */
      /*                     this object module.  It must be big enough   */
      /*                     to hold the content of the master DSBT.      */
      /*------------------------------------------------------------------*/
      case DT_C6000_DSBT_SIZE:
         dyn_module->dsbt_size = dyn_module->dyntab[i].d_un.d_val;
         return TRUE;

   }

   /*------------------------------------------------------------------------*/
   /* Not one of the C6x tags handled above.                                 */
   /*------------------------------------------------------------------------*/
   return FALSE;
}

/*****************************************************************************/
/* DLDYN_c60_relocate_dynamic_tag_info()                                     */
/*                                                                           */
/*    Update any target specific dynamic tag values that are associated with */
/*    a section address.  Return TRUE if the tag value is successfully       */
/*    updated or if the tag is not associated with a section address, and    */
/*    FALSE if we can't find the section associated with the tag or if the   */
/*    tag type is not recognized.                                            */
/*                                                                           */
/*****************************************************************************/
BOOL DLDYN_c60_relocate_dynamic_tag_info(DLIMP_Dynamic_Module *dyn_module,
                                         int32_t i)
{
   switch (dyn_module->dyntab[i].d_tag)
   {
      /*---------------------------------------------------------------------*/
      /* These tags do not point to sections.                                */
      /*---------------------------------------------------------------------*/
      case DT_C6000_GSYM_OFFSET:
      case DT_C6000_GSTR_OFFSET:
      case DT_C6000_DSBT_INDEX:
      case DT_C6000_DSBT_SIZE:
         return TRUE;

      /*---------------------------------------------------------------------*/
      /* DT_C6000_DSBT_BASE: This tag value provides the virtual address of  */
      /*                     the .dsbt section. We will go find the program  */
      /*                     header entry associated with the DSBT section   */
      /*                     and update this tag with the section's run      */
      /*                     address.                                        */
      /*---------------------------------------------------------------------*/
      case DT_C6000_DSBT_BASE:
         return DLIMP_update_dyntag_section_address(dyn_module, i);
   }

   /*------------------------------------------------------------------------*/
   /* Any other tag is reported as an error and rejected.                    */
   /*------------------------------------------------------------------------*/
   DLIF_error(DLET_MISC, "Invalid dynamic tag encountered, %d\n",
                         (int)dyn_module->dyntab[i].d_tag);
   return FALSE;
}
Verify that the OSABI is supported and set */ +/* any variables which depend on the OSABI. */ +/*****************************************************************************/ +BOOL DLDYN_c60_process_eiosabi(DLIMP_Dynamic_Module* dyn_module) +{ + uint8_t osabi = dyn_module->fhdr.e_ident[EI_OSABI]; + + if (dyn_module->relocatable) + { + /*-------------------------------------------------------------------*/ + /* ELFOSABI_C6000_ELFABI - C6x Baremetal ABI */ + /*-------------------------------------------------------------------*/ + if (osabi == ELFOSABI_C6000_ELFABI) + return TRUE; + +#if 0 + /*-------------------------------------------------------------------*/ + /* ELFOSABI_C6000_LINUX - C6x Linux ABI */ + /* presently unsupported */ + /*-------------------------------------------------------------------*/ + if (osabi == ELFOSABI_C6000_LINUX) + return TRUE; +#endif + } + else + { + /*-------------------------------------------------------------------*/ + /* Static executables should have an OSABI of NONE. */ + /*-------------------------------------------------------------------*/ + if (osabi == ELFOSABI_NONE) + return TRUE; + } + + return FALSE; +} + +#endif diff --git a/src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_dynamic.h b/src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_dynamic.h new file mode 100644 index 0000000..da99604 --- /dev/null +++ b/src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_dynamic.h @@ -0,0 +1,53 @@ +/* +* c60_dynamic.h +* +* Interface into C6x-specific dynamic loader functionality +* +* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/ +* +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. 
/*---------------------------------------------------------------------------*/
/* C6x-specific dynamic loader entry points (defined in c60_dynamic.c).      */
/*---------------------------------------------------------------------------*/

/* Consume dynamic table entry i if it is a C6x specific tag; TRUE if it was */
/* recognized, FALSE otherwise.                                              */
BOOL DLDYN_c60_process_dynamic_tag(DLIMP_Dynamic_Module* dyn_module, int i);

/* Verify that the module's EI_OSABI value is supported; TRUE if accepted.   */
BOOL DLDYN_c60_process_eiosabi(DLIMP_Dynamic_Module* dyn_module);

/* Update dynamic table entry i if its value is a section address; FALSE if  */
/* the tag is not recognized or the update fails.                            */
BOOL DLDYN_c60_relocate_dynamic_tag_info(DLIMP_Dynamic_Module *dyn_module, int32_t i);

/*---------------------------------------------------------------------------*/
/* Target size parameters.                                                   */
/* NOTE(review): presumably the target's int size, char size, minimum        */
/* addressable unit and pointer size, all in bits -- confirm against users.  */
/*---------------------------------------------------------------------------*/
#define T_INTSZ 32
#define T_CHARSZ 8
#define MEM_INC 8
#define PTR_SZ 32
+* +* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/ +* +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* +* Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the +* distribution. +* +* Neither the name of Texas Instruments Incorporated nor the names of +* its contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/*---------------------------------------------------------------------------*/
/* C6x specific EI_OSABI values                                              */
/*---------------------------------------------------------------------------*/
enum
{
   ELFOSABI_C6000_ELFABI = 64, /* C6X Baremetal OSABI */
   ELFOSABI_C6000_LINUX = 65 /* C6X Linux OSABI */
};

/*---------------------------------------------------------------------------*/
/* File Header Flags (value of "e_flags")                                    */
/*---------------------------------------------------------------------------*/
enum
{
   EF_C6000_REL = 0x01 /* Contains static relocations. A ET_EXEC or */
                       /* ET_DYN file w/ this flag set can be       */
                       /* treated as ET_REL during static linking.  */
};

/*---------------------------------------------------------------------------*/
/* Segment Types (value of "p_type")                                         */
/*---------------------------------------------------------------------------*/
enum
{
   PT_C6000_PHATTRS = 0x70000000 /* Extended Program Header Attributes*/
};

/*---------------------------------------------------------------------------*/
/* C6x specific section types                                                */
/*---------------------------------------------------------------------------*/
enum
{

   /*------------------------------------------------------------------------*/
   /* Section types defined by the C6x ELFABI.                               */
   /* Note: ABI defined section type should be named SHT_C6000_xxx           */
   /*------------------------------------------------------------------------*/
   SHT_C6000_UNWIND = 0x70000001, /* Exception Index Table */
   SHT_C6000_PREEMPTMAP = 0x70000002, /* Pre-emption Map */

   SHT_C6000_ATTRIBUTES = 0x70000003, /* Obj File Compatibility Attributes */

   /*------------------------------------------------------------------------*/
   /* The following section types are not part of C6x ABI. As per the ABI,   */
   /* the processor specific values not defined in the ABI are reserved for  */
   /* future use. Here we reserve the range 0x7F000000 through 0x7FFFFFFF    */
   /* for the TI specific processor section types.                           */
   /* Note: TI specific section type should be named SHT_TI_xxx              */
   /*------------------------------------------------------------------------*/
   SHT_TI_ICODE = 0x7F000000, /* ICODE representation */
   SHT_TI_XREF = 0x7F000001, /* Symbol cross reference */
   SHT_TI_HANDLER = 0x7F000002, /* Handler function table */
   SHT_TI_INITINFO = 0x7F000003, /* Info for C auto-init of variables */
   SHT_TI_PHATTRS = 0x7F000004 /* Extended Program Header Attributes*/
};

/*****************************************************************************/
/* C6x-Specific Dynamic Array Tags (C6x ELF ABI Section ??? - AEGUPD)        */
/* NOTE:                                                                     */
/*    As per the GABI, a tag whose value is an even number indicates a       */
/*    dynamic tag that uses d_ptr.  An odd number indicates a tag that uses  */
/*    d_val, or one that uses neither d_val nor d_ptr.                       */
/*****************************************************************************/
enum
{
   /*------------------------------------------------------------------------*/
   /* OSABI specific tags:                                                   */
   /*    From 0x6000000D thru 0x6FFFF000                                     */
   /*------------------------------------------------------------------------*/
   DT_C6000_GSYM_OFFSET = 0x6000000D, /* d_val -- OSABI Specific -- */
   DT_C6000_GSTR_OFFSET = 0x6000000F, /* d_val -- OSABI Specific -- */

   /*------------------------------------------------------------------------*/
   /* Processor specific tags:                                               */
   /*    From 0x70000000 thru 0x7FFFFFFF                                     */
   /*------------------------------------------------------------------------*/
   DT_C6000_DSBT_BASE = 0x70000000, /* d_ptr -- Platform Specific -- */
   DT_C6000_DSBT_SIZE = 0x70000001, /* d_val -- Platform Specific -- */
   DT_C6000_PREEMPTMAP = 0x70000002, /* d_ptr -- Platform Specific -- */
   DT_C6000_DSBT_INDEX = 0x70000003 /* d_val -- Platform Specific -- */
};

/*---------------------------------------------------------------------------*/
/* C6x Dynamic Relocation Types                                              */
/* The numeric values are the relocation type codes handled by the loader's  */
/* relocation code (see c60_reloc.c).                                        */
/*---------------------------------------------------------------------------*/
typedef enum
{
   R_C6000_NONE = 0,
   R_C6000_ABS32 = 1,
   R_C6000_ABS16 = 2,
   R_C6000_ABS8 = 3,
   R_C6000_PCR_S21 = 4,
   R_C6000_PCR_S12 = 5,
   R_C6000_PCR_S10 = 6,
   R_C6000_PCR_S7 = 7,
   R_C6000_ABS_S16 = 8,
   R_C6000_ABS_L16 = 9,
   R_C6000_ABS_H16 = 10,
   R_C6000_SBR_U15_B = 11,
   R_C6000_SBR_U15_H = 12,
   R_C6000_SBR_U15_W = 13,
   R_C6000_SBR_S16 = 14,
   R_C6000_SBR_L16_B = 15,
   R_C6000_SBR_L16_H = 16,
   R_C6000_SBR_L16_W = 17,
   R_C6000_SBR_H16_B = 18,
   R_C6000_SBR_H16_H = 19,
   R_C6000_SBR_H16_W = 20,
   R_C6000_SBR_GOT_U15_W = 21,
   R_C6000_SBR_GOT_L16_W = 22,
   R_C6000_SBR_GOT_H16_W = 23,
   R_C6000_DSBT_INDEX = 24,
   R_C6000_PREL31 = 25,
   R_C6000_COPY = 26
}C60_RELOC_TYPE;
b/src/core/dsp/ocl_load/C60_DLOAD_REL/c60_reloc.c new file mode 100644 index 0000000..3c79e35 --- /dev/null +++ b/src/core/dsp/ocl_load/C60_DLOAD_REL/c60_reloc.c @@ -0,0 +1,1101 @@ +/* +* c60_reloc.c +* +* Process C6x-specific dynamic relocations for core dynamic loader. +* +* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/ +* +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* +* Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the +* distribution. +* +* Neither the name of Texas Instruments Incorporated nor the names of +* its contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/*---------------------------------------------------------------------------*/
/* MASK(n,s) - Bit mask covering an n-bit field whose least significant bit  */
/*             is at position s, e.g. MASK(21,7) == 0x0FFFFF80.              */
/*                                                                           */
/* Both arguments are parenthesized so that expressions may be passed        */
/* safely, and the shifts are done on an unsigned value so that no signed    */
/* overflow can occur.  n must be less than the width of unsigned int.       */
/*---------------------------------------------------------------------------*/
#define MASK(n,s) (((1u << (n)) - 1u) << (s))
*/ +/*****************************************************************************/ +static void write_reloc_r(uint8_t* buffered_segment, + uint32_t segment_offset, + int r_type, uint32_t r) +{ + uint32_t* rel_field_ptr = (uint32_t*)(buffered_segment + segment_offset); + +#if LOADER_DEBUG + /*------------------------------------------------------------------------*/ + /* Print some details about the relocation we are about to process. */ + /*------------------------------------------------------------------------*/ + if(debugging_on) + { + DLIF_trace("RWRT: segment_offset: %d\n", segment_offset); + DLIF_trace("RWRT: buffered_segment: 0x%x\n", + (uint32_t)buffered_segment); + DLIF_trace("RWRT: rel_field_ptr: 0x%x\n", (uint32_t)rel_field_ptr); + DLIF_trace("RWRT: result: 0x%x\n", r); + } +#endif + + + /*------------------------------------------------------------------------*/ + /* Given the relocation type, carry out relocation into a 4 byte packet */ + /* within the buffered segment. */ + /*------------------------------------------------------------------------*/ + switch(r_type) + { + case R_C6000_ABS32: + *rel_field_ptr = r; + break; + case R_C6000_PREL31: + *rel_field_ptr = (*rel_field_ptr & ~MASK(30,0)) | r; + break; + case R_C6000_ABS16: + *((uint16_t*)(buffered_segment + segment_offset)) = r; + break; + case R_C6000_ABS8: + *((uint8_t*)(buffered_segment + segment_offset)) = r; + break; + case R_C6000_PCR_S21: + *rel_field_ptr = (*rel_field_ptr & ~MASK(21,7)) | (r << 7); + break; + case R_C6000_PCR_S12: + *rel_field_ptr = (*rel_field_ptr & ~MASK(12,16)) | (r << 16); + break; + case R_C6000_PCR_S10: + *rel_field_ptr = (*rel_field_ptr & ~MASK(10,13)) | (r << 13); + break; + case R_C6000_PCR_S7: + *rel_field_ptr = (*rel_field_ptr & ~MASK(7,16)) | (r << 16); + break; + + case R_C6000_ABS_S16: + *rel_field_ptr = (*rel_field_ptr & ~MASK(16,7)) | (r << 7); + break; + case R_C6000_ABS_L16: + *rel_field_ptr = (*rel_field_ptr & ~MASK(16,7)) | (r << 7); + break; + case 
R_C6000_ABS_H16: + *rel_field_ptr = (*rel_field_ptr & ~MASK(16,7)) | (r << 7); + break; + + case R_C6000_SBR_U15_B: + *rel_field_ptr = (*rel_field_ptr & ~MASK(15,8)) | (r << 8); + break; + case R_C6000_SBR_U15_H: + *rel_field_ptr = (*rel_field_ptr & ~MASK(15,8)) | (r << 8); + break; + case R_C6000_SBR_U15_W: + case R_C6000_DSBT_INDEX: + *rel_field_ptr = (*rel_field_ptr & ~MASK(15,8)) | (r << 8); + break; + + case R_C6000_SBR_S16: + case R_C6000_SBR_L16_B: + case R_C6000_SBR_L16_H: + case R_C6000_SBR_L16_W: + case R_C6000_SBR_H16_B: + case R_C6000_SBR_H16_H: + case R_C6000_SBR_H16_W: + *rel_field_ptr = (*rel_field_ptr & ~MASK(16,7)) | (r << 7); + break; + + /*---------------------------------------------------------------------*/ + /* Linux "import-as-own" copy relocations are not yet supported. */ + /*---------------------------------------------------------------------*/ + case R_C6000_COPY: + + default: + DLIF_error(DLET_RELOC, + "write_reloc_r called with invalid relocation type!\n"); + } + +#if LOADER_DEBUG + if (debugging_on) + DLIF_trace("reloc_field 0x%x\n", *rel_field_ptr); +#endif +} + +/*****************************************************************************/ +/* PACK_RESULT() - Pack the result of a relocation calculation for storage */ +/* in the relocation field. 
*/
/*****************************************************************************/
static int32_t pack_result(int32_t unpacked_result, int r_type)
{
    /*------------------------------------------------------------------------*/
    /* Number of low-order bits dropped from the computed value before it is  */
    /* stored in the relocation field for the given type.                     */
    /*------------------------------------------------------------------------*/
    int shift;

    switch (r_type)
    {
        /*--------------------------------------------------------------------*/
        /* Stored as computed.                                                */
        /*--------------------------------------------------------------------*/
        case R_C6000_ABS32:
        case R_C6000_ABS16:
        case R_C6000_ABS8:
        case R_C6000_ABS_S16:
        case R_C6000_ABS_L16:
        case R_C6000_SBR_U15_B:
        case R_C6000_SBR_S16:
        case R_C6000_SBR_L16_B:
            shift = 0;
            break;

        /*--------------------------------------------------------------------*/
        /* Stored divided by 2.                                               */
        /*--------------------------------------------------------------------*/
        case R_C6000_SBR_U15_H:
        case R_C6000_SBR_L16_H:
        case R_C6000_PREL31:
            shift = 1;
            break;

        /*--------------------------------------------------------------------*/
        /* Stored divided by 4.                                               */
        /*--------------------------------------------------------------------*/
        case R_C6000_PCR_S21:
        case R_C6000_PCR_S12:
        case R_C6000_PCR_S10:
        case R_C6000_PCR_S7:
        case R_C6000_SBR_U15_W:
        case R_C6000_SBR_L16_W:
        case R_C6000_DSBT_INDEX:
            shift = 2;
            break;

        /*--------------------------------------------------------------------*/
        /* Upper-half types: the 16-bit shift selects the high half, and the  */
        /* _H / _W variants drop 1 / 2 further bits.                          */
        /*--------------------------------------------------------------------*/
        case R_C6000_ABS_H16:
        case R_C6000_SBR_H16_B:
            shift = 16;
            break;

        case R_C6000_SBR_H16_H:
            shift = 17;
            break;

        case R_C6000_SBR_H16_W:
            shift = 18;
            break;

        /*--------------------------------------------------------------------*/
        /* Linux "import-as-own" copy relocations are not yet supported, so   */
        /* R_C6000_COPY is reported like an unknown type and packs to 0.      */
        /*--------------------------------------------------------------------*/
        case R_C6000_COPY:
        default:
            DLIF_error(DLET_RELOC,
                       "pack_result called with invalid relocation type!\n");
            return 0;
    }

    return unpacked_result >> shift;
}

/*****************************************************************************/
/* MASK_RESULT() - Mask the result of a relocation calculation so that it    */
/*                 fits the size of the relocation type's field.
*/ +/*****************************************************************************/ +static int32_t mask_result(int32_t unmasked_result, int r_type) +{ + switch(r_type) + { + case R_C6000_ABS8: + return unmasked_result & 0xFF; + + case R_C6000_ABS32: + return unmasked_result; + + case R_C6000_ABS16: + case R_C6000_ABS_S16: + case R_C6000_ABS_L16: + case R_C6000_ABS_H16: + case R_C6000_SBR_S16: + case R_C6000_SBR_L16_B: + case R_C6000_SBR_L16_H: + case R_C6000_SBR_L16_W: + case R_C6000_SBR_H16_B: + case R_C6000_SBR_H16_H: + case R_C6000_SBR_H16_W: + return unmasked_result & 0xFFFF; + + case R_C6000_PCR_S21: + return unmasked_result & 0x1FFFFF; + + case R_C6000_PCR_S12: + return unmasked_result & 0xFFF; + + case R_C6000_PCR_S10: + return unmasked_result & 0x3FF; + + case R_C6000_PCR_S7: + return unmasked_result & 0x7F; + + case R_C6000_SBR_U15_B: + case R_C6000_SBR_U15_H: + case R_C6000_SBR_U15_W: + case R_C6000_DSBT_INDEX: + return unmasked_result & 0x7FFF; + + case R_C6000_PREL31: + return unmasked_result & 0x7FFFFFFF; + + /*---------------------------------------------------------------------*/ + /* Linux "import-as-own" copy relocations are not yet supported. */ + /*---------------------------------------------------------------------*/ + case R_C6000_COPY: + + default: + DLIF_error(DLET_RELOC, + "mask_result called with invalid relocation type!\n"); + return 0; + } +} + +/*****************************************************************************/ +/* REL_OVERFLOW() */ +/* */ +/* Check relocation value against the range associated with a given */ +/* relocation type field size and signedness. */ +/* */ +/*****************************************************************************/ +static BOOL rel_overflow(C60_RELOC_TYPE r_type, int32_t reloc_value) +{ + /*------------------------------------------------------------------------*/ + /* Select appropriate range check based on relocation type. 
*/ + /*------------------------------------------------------------------------*/ + switch(r_type) + { + case R_C6000_ABS16: return ((reloc_value > 65535) || + (reloc_value < -32768)); + case R_C6000_ABS8: return ((reloc_value > 255) || + (reloc_value < -128)); + case R_C6000_PCR_S21: return ((reloc_value >= 0x400000) || + (reloc_value < -0x400000)); + case R_C6000_PCR_S12: return ((reloc_value >= 0x2000) || + (reloc_value < -0x2000)); + case R_C6000_PCR_S10: return ((reloc_value >= 0x800) || + (reloc_value < -0x800)); + case R_C6000_PCR_S7: return ((reloc_value >= 0x100) || + (reloc_value < -0x100)); + case R_C6000_SBR_S16: + case R_C6000_ABS_S16: return ((reloc_value >= 0x8000) || + (reloc_value < -0x8000)); + case R_C6000_SBR_U15_B: return (((uint32_t)reloc_value) >= 0x8000); + case R_C6000_SBR_U15_H: return (((uint32_t)reloc_value) >= 0xFFFF); + case R_C6000_DSBT_INDEX: + case R_C6000_SBR_U15_W: return (((uint32_t)reloc_value) >= 0x1FFFD); + + + /*---------------------------------------------------------------------*/ + /* Some relocation types suppress overflow checking at link-time. */ + /*---------------------------------------------------------------------*/ + case R_C6000_ABS_L16: + case R_C6000_ABS_H16: + case R_C6000_SBR_L16_B: + case R_C6000_SBR_L16_H: + case R_C6000_SBR_L16_W: + case R_C6000_SBR_H16_B: + case R_C6000_SBR_H16_H: + case R_C6000_SBR_H16_W: + return 0; + + /*---------------------------------------------------------------------*/ + /* 32-bit relocation field values are not checked for overflow. */ + /*---------------------------------------------------------------------*/ + case R_C6000_ABS32: + case R_C6000_PREL31: + return 0; + + /*---------------------------------------------------------------------*/ + /* If relocation type did not appear in the above switch, then we */ + /* didn't expect to see it. 
*/ + /*---------------------------------------------------------------------*/ + default: + DLIF_error(DLET_RELOC, + "rel_overflow called with invalid relocation type!\n"); + } + + return 1; +} + +#if LOADER_DEBUG || LOADER_PROFILE +extern int DLREL_relocations; +extern time_t DLREL_total_reloc_time; +#endif + +/*****************************************************************************/ +/* RELOC_DO() - Process a single relocation entry. */ +/*****************************************************************************/ +static void reloc_do(C60_RELOC_TYPE r_type, + uint32_t segment_vaddr, + uint8_t *segment_buffer, + uint32_t addend, + uint32_t symval, + uint32_t spc, + int wrong_endian, + uint32_t base_pointer, + int32_t dsbt_index) +{ + int32_t reloc_value = 0; + +#if LOADER_DEBUG || LOADER_PROFILE + /*------------------------------------------------------------------------*/ + /* In debug mode, keep a count of the number of relocations processed. */ + /* In profile mode, start the clock on a given relocation. */ + /*------------------------------------------------------------------------*/ + int start_time = 0; + if (debugging_on || profiling_on) + { + DLREL_relocations++; + if (profiling_on) start_time = clock(); + } +#endif + + /*------------------------------------------------------------------------*/ + /* Calculate the relocation value according to the rules associated with */ + /* the given relocation type. */ + /*------------------------------------------------------------------------*/ + switch(r_type) + { + /*---------------------------------------------------------------------*/ + /* Straight-Up Address relocations (address references). 
*/ + /*---------------------------------------------------------------------*/ + case R_C6000_ABS32: + case R_C6000_ABS16: + case R_C6000_ABS8: + case R_C6000_ABS_S16: + case R_C6000_ABS_L16: + case R_C6000_ABS_H16: + reloc_value = symval + addend; + break; + + /*---------------------------------------------------------------------*/ + /* PC-Relative relocations (calls and branches). */ + /*---------------------------------------------------------------------*/ + case R_C6000_PCR_S21: + case R_C6000_PCR_S12: + case R_C6000_PCR_S10: + case R_C6000_PCR_S7: + { + /*------------------------------------------------------------------*/ + /* Add SPC to segment address to get the PC. Mask for exec-packet */ + /* boundary. */ + /*------------------------------------------------------------------*/ + int32_t opnd_p = (spc + segment_vaddr) & 0xffffffe0; + reloc_value = symval + addend - opnd_p; + break; + } + + /*---------------------------------------------------------------------*/ + /* "Place"-relative relocations (TDEH). */ + /*---------------------------------------------------------------------*/ + /* These relocations occur in data and refer to a label that occurs */ + /* at some signed 32-bit offset from the place where the relocation */ + /* occurs. */ + /*---------------------------------------------------------------------*/ + case R_C6000_PREL31: + { + /*------------------------------------------------------------------*/ + /* Compute location of relocation entry and subtract it from the */ + /* address of the location being referenced (it is computed very */ + /* much like a PC-relative relocation, but it occurs in data and */ + /* is called a "place"-relative relocation). */ + /*------------------------------------------------------------------*/ + /* If this is an Elf32_Rel type relocation, then addend is assumed */ + /* to have been scaled when it was unpacked (field << 1). 
*/ + /*------------------------------------------------------------------*/ + /* For Elf32_Rela type relocations the addend is assumed to be a */ + /* signed 32-bit integer value. */ + /*------------------------------------------------------------------*/ + /* Offset is not fetch-packet relative; doesn't need to be masked. */ + /*------------------------------------------------------------------*/ + int32_t opnd_p = (spc + segment_vaddr); + reloc_value = symval + addend - opnd_p; + break; + } + + /*---------------------------------------------------------------------*/ + /* Static-Base Relative relocations (near-DP). */ + /*---------------------------------------------------------------------*/ + case R_C6000_SBR_U15_B: + case R_C6000_SBR_U15_H: + case R_C6000_SBR_U15_W: + case R_C6000_SBR_S16: + case R_C6000_SBR_L16_B: + case R_C6000_SBR_L16_H: + case R_C6000_SBR_L16_W: + case R_C6000_SBR_H16_B: + case R_C6000_SBR_H16_H: + case R_C6000_SBR_H16_W: + reloc_value = symval + addend - base_pointer; + break; + + /*---------------------------------------------------------------------*/ + /* R_C6000_DSBT_INDEX - uses value assigned by the dynamic loader to */ + /* be the DSBT index for this module as a scaled offset when */ + /* referencing the DSBT. The DSBT base address is in symval and the */ + /* static base is in base_pointer. DP-relative offset to slot in */ + /* DSBT is the offset of the DSBT relative to the DP plus the */ + /* scaled DSBT index into the DSBT. 
*/ + /*---------------------------------------------------------------------*/ + case R_C6000_DSBT_INDEX: + reloc_value = ((symval + addend) - base_pointer) + (dsbt_index << 2); + break; + + /*---------------------------------------------------------------------*/ + /* Linux "import-as-own" copy relocation: after DSO initialization, */ + /* copy the named object from the DSO into the executable's BSS */ + /*---------------------------------------------------------------------*/ + /* Linux "import-as-own" copy relocations are not yet supported. */ + /*---------------------------------------------------------------------*/ + case R_C6000_COPY: + + /*---------------------------------------------------------------------*/ + /* Unrecognized relocation type. */ + /*---------------------------------------------------------------------*/ + default: + DLIF_error(DLET_RELOC, + "reloc_do called with invalid relocation type!\n"); + break; + } + + /*------------------------------------------------------------------------*/ + /* Overflow checking. Is relocation value out of range for the size and */ + /* type of the current relocation? */ + /*------------------------------------------------------------------------*/ + if (rel_overflow(r_type, reloc_value)) + DLIF_error(DLET_RELOC, "relocation overflow!\n"); + + /*------------------------------------------------------------------------*/ + /* Move relocation value to appropriate offset for relocation field's */ + /* location. */ + /*------------------------------------------------------------------------*/ + reloc_value = pack_result(reloc_value, r_type); + + /*------------------------------------------------------------------------*/ + /* Mask packed result to the size of the relocation field. 
*/ + /*------------------------------------------------------------------------*/ + reloc_value = mask_result(reloc_value, r_type); + + /*------------------------------------------------------------------------*/ + /* If necessary, Swap endianness of data at relocation address. */ + /*------------------------------------------------------------------------*/ + if (wrong_endian) + DLIMP_change_endian32((int32_t*)(segment_buffer + spc)); + + /*------------------------------------------------------------------------*/ + /* Write the relocated 4-byte packet back to the segment buffer. */ + /*------------------------------------------------------------------------*/ + write_reloc_r(segment_buffer, spc, r_type, reloc_value); + + /*------------------------------------------------------------------------*/ + /* Change endianness of segment address back to original. */ + /*------------------------------------------------------------------------*/ + if (wrong_endian) + DLIMP_change_endian32((int32_t*)(segment_buffer + spc)); + +#if LOADER_DEBUG || LOADER_PROFILE + /*------------------------------------------------------------------------*/ + /* In profile mode, add elapsed time for this relocation to total time */ + /* spent doing relocations. */ + /*------------------------------------------------------------------------*/ + if (profiling_on) + DLREL_total_reloc_time += (clock() - start_time); + if (debugging_on) + DLIF_trace("reloc_value = 0x%x\n", reloc_value); +#endif +} + +/*****************************************************************************/ +/* REL_UNPACK_ADDEND() */ +/* */ +/* Unpack addend value from the relocation field. 
*/ +/* */ +/*****************************************************************************/ +static void rel_unpack_addend(C60_RELOC_TYPE r_type, + uint8_t *address, + uint32_t *addend) +{ + /*------------------------------------------------------------------------*/ + /* C6000 does not support Elf32_Rel type relocations in the dynamic */ + /* loader core. We will emit an internal error and abort until this */ + /* support is added. I abort here because this is necessarily a target- */ + /* specific part of the relocation infrastructure. */ + /*------------------------------------------------------------------------*/ + *addend = 0; + + DLIF_error(DLET_RELOC, + "Internal Error: unpacking addend values from the relocation " + "field is not supported in the C6000 dynamic loader at this " + "time; aborting\n"); + DLIF_exit(1); +} + +/*****************************************************************************/ +/* REL_SWAP_ENDIAN() */ +/* */ +/* Return TRUE if we should change the endianness of a relocation field. */ +/* */ +/*****************************************************************************/ +static BOOL rel_swap_endian(DLIMP_Dynamic_Module *dyn_module, + C60_RELOC_TYPE r_type) +{ + if (dyn_module->wrong_endian) return TRUE; + + return FALSE; +} + +/*****************************************************************************/ +/* REL_CHANGE_ENDIAN() */ +/* */ +/* Change the endianness of the relocation field at the specified address */ +/* in the segment's data. */ +/* */ +/*****************************************************************************/ +static void rel_change_endian(C60_RELOC_TYPE r_type, uint8_t *address) +{ + /*------------------------------------------------------------------------*/ + /* On C6000, all instructions are 32-bits wide. 
*/ + /*------------------------------------------------------------------------*/ + DLIMP_change_endian32((int32_t *)address); +} + +/*****************************************************************************/ +/* READ_REL_TABLE() */ +/* */ +/* Read in an Elf32_Rel type relocation table. This function allocates */ +/* host memory for the table. */ +/* */ +/*****************************************************************************/ +static void read_rel_table(struct Elf32_Rel **rel_table, + int32_t table_offset, + uint32_t relnum, uint32_t relent, + LOADER_FILE_DESC *fd, BOOL wrong_endian) +{ + if (relnum == 0) { *rel_table = NULL; return; } + + *rel_table = (struct Elf32_Rel *)DLIF_malloc(relnum * relent); + DLIF_fseek(fd, table_offset, LOADER_SEEK_SET); + DLIF_fread(*rel_table, relnum, relent, fd); + + if (wrong_endian) + { + int i; + for (i = 0; i < relnum; i++) + DLIMP_change_rel_endian(*rel_table + i); + } +} + +/*****************************************************************************/ +/* PROCESS_REL_TABLE() */ +/* */ +/* Process table of Elf32_Rel type relocations. */ +/* */ +/*****************************************************************************/ +static void process_rel_table(DLOAD_HANDLE handle, + DLIMP_Loaded_Segment* seg, + struct Elf32_Rel *rel_table, + uint32_t relnum, + int32_t *start_relidx, + uint32_t ti_static_base, + DLIMP_Dynamic_Module* dyn_module) +{ + Elf32_Addr seg_start_addr = seg->input_vaddr; + Elf32_Addr seg_end_addr = seg_start_addr + seg->phdr.p_memsz; + BOOL found = FALSE; + int32_t relidx = *start_relidx; + + /*------------------------------------------------------------------------*/ + /* If the given start reloc index is out of range, then start from the */ + /* beginning of the given table. 
*/ + /*------------------------------------------------------------------------*/ + if (relidx >= relnum) relidx = 0; + + /*------------------------------------------------------------------------*/ + /* Spin through Elf32_Rel type relocation table. */ + /*------------------------------------------------------------------------*/ + for ( ; relidx < relnum; relidx++) + { + /*---------------------------------------------------------------------*/ + /* If the relocation offset falls within the segment, process it. */ + /*---------------------------------------------------------------------*/ + if (rel_table[relidx].r_offset >= seg_start_addr && + rel_table[relidx].r_offset < seg_end_addr) + { + Elf32_Addr r_symval = 0; + C60_RELOC_TYPE r_type = + (C60_RELOC_TYPE)ELF32_R_TYPE(rel_table[relidx].r_info); + int32_t r_symid = ELF32_R_SYM(rel_table[relidx].r_info); + + uint8_t *reloc_address = NULL; + uint32_t pc = 0; + uint32_t addend = 0; + + BOOL change_endian = FALSE; + + found = TRUE; + + /*------------------------------------------------------------------*/ + /* If symbol definition is not found, don't do the relocation. */ + /* An error is generated by the lookup function. */ + /*------------------------------------------------------------------*/ + if (!DLSYM_canonical_lookup(handle, r_symid, dyn_module, &r_symval)) + continue; + + /*------------------------------------------------------------------*/ + /* Addend value is stored in the relocation field. */ + /* We'll need to unpack it from the data for the segment that is */ + /* currently being relocated. 
*/ + /*------------------------------------------------------------------*/ + pc = rel_table[relidx].r_offset - seg->input_vaddr; + reloc_address = (uint8_t *)seg->host_address + pc; + + change_endian = rel_swap_endian(dyn_module, r_type); + if (change_endian) + rel_change_endian(r_type, reloc_address); + + rel_unpack_addend( + (C60_RELOC_TYPE)ELF32_R_TYPE(rel_table[relidx].r_info), + reloc_address, &addend); + + /*------------------------------------------------------------------*/ + /* Perform actual relocation. This is a really wide function */ + /* interface and could do with some encapsulation. */ + /*------------------------------------------------------------------*/ + reloc_do(r_type, + seg->phdr.p_vaddr, + seg->host_address, + addend, + r_symval, + pc, + dyn_module->wrong_endian, + ti_static_base, + dyn_module->dsbt_index); + + } + + else if (found) + break; + } +} + +/*****************************************************************************/ +/* READ_RELA_TABLE() */ +/* */ +/* Read in an Elf32_Rela type relocation table. This function allocates */ +/* host memory for the table. */ +/* */ +/*****************************************************************************/ +static void read_rela_table(struct Elf32_Rela **rela_table, + int32_t table_offset, + uint32_t relanum, uint32_t relaent, + LOADER_FILE_DESC *fd, BOOL wrong_endian) +{ + if (relanum == 0) { *rela_table = NULL; return; } + *rela_table = (struct Elf32_Rela *)DLIF_malloc(relanum * relaent); + DLIF_fseek(fd, table_offset, LOADER_SEEK_SET); + DLIF_fread(*rela_table, relanum, relaent, fd); + + if (wrong_endian) + { + int i; + for (i = 0; i < relanum; i++) + DLIMP_change_rela_endian(*rela_table + i); + } +} + +/*****************************************************************************/ +/* PROCESS_RELA_TABLE() */ +/* */ +/* Process a table of Elf32_Rela type relocations. 
*/ +/* */ +/*****************************************************************************/ +static void process_rela_table(DLOAD_HANDLE handle, + DLIMP_Loaded_Segment *seg, + struct Elf32_Rela *rela_table, + uint32_t relanum, + int32_t *start_relidx, + uint32_t ti_static_base, + DLIMP_Dynamic_Module *dyn_module) +{ + Elf32_Addr seg_start_addr = seg->input_vaddr; + Elf32_Addr seg_end_addr = seg_start_addr + seg->phdr.p_memsz; + BOOL found = FALSE; + int32_t relidx = *start_relidx; + + /*-----------------------------------------------------------------------*/ + /* If the given start reloc index is out of range, then start from */ + /* the beginning of the given table. */ + /*-----------------------------------------------------------------------*/ + if (relidx > relanum) relidx = 0; + + /*-----------------------------------------------------------------------*/ + /* Spin through RELA relocation table. */ + /*-----------------------------------------------------------------------*/ + for ( ; relidx < relanum; relidx++) + { + /*-------------------------------------------------------------------*/ + /* If the relocation offset falls within the segment, process it. */ + /*-------------------------------------------------------------------*/ + if (rela_table[relidx].r_offset >= seg_start_addr && + rela_table[relidx].r_offset < seg_end_addr) + { + Elf32_Addr r_symval; + C60_RELOC_TYPE r_type = + (C60_RELOC_TYPE)ELF32_R_TYPE(rela_table[relidx].r_info); + int32_t r_symid = ELF32_R_SYM(rela_table[relidx].r_info); + + found = TRUE; + + /*---------------------------------------------------------------*/ + /* If symbol definition is not found, don't do the relocation. */ + /* An error is generated by the lookup function. */ + /*---------------------------------------------------------------*/ + if (!DLSYM_canonical_lookup(handle, r_symid, dyn_module, &r_symval)) + continue; + + /*---------------------------------------------------------------*/ + /* Perform actual relocation. 
This is a really wide function */ + /* interface and could do with some encapsulation. */ + /*---------------------------------------------------------------*/ + reloc_do(r_type, + seg->phdr.p_vaddr, + seg->host_address, + rela_table[relidx].r_addend, + r_symval, + rela_table[relidx].r_offset - seg->input_vaddr, + dyn_module->wrong_endian, + ti_static_base, + dyn_module->dsbt_index); + } + + else if (found) + break; + } +} + +/*****************************************************************************/ +/* PROCESS_GOT_RELOCS() */ +/* */ +/* Process all GOT relocations. It is possible to have both Elf32_Rel */ +/* and Elf32_Rela type relocations in the same file, so we handle tham */ +/* both. */ +/* */ +/*****************************************************************************/ +static void process_got_relocs(DLOAD_HANDLE handle, + struct Elf32_Rel* rel_table, uint32_t relnum, + struct Elf32_Rela* rela_table, uint32_t relanum, + DLIMP_Dynamic_Module* dyn_module) +{ + DLIMP_Loaded_Segment *seg = + (DLIMP_Loaded_Segment*)(dyn_module->loaded_module->loaded_segments.buf); + uint32_t num_segs = dyn_module->loaded_module->loaded_segments.size; + int32_t rel_relidx = 0; + int32_t rela_relidx = 0; + uint32_t seg_idx = 0; + uint32_t ti_static_base = 0; + + /*------------------------------------------------------------------------*/ + /* Get the value of the static base (__TI_STATIC_BASE) which will be */ + /* passed into the relocation table processing functions. */ + /*------------------------------------------------------------------------*/ + if (!DLSYM_lookup_local_symtab("__TI_STATIC_BASE", dyn_module->symtab, + dyn_module->symnum, &ti_static_base)) + DLIF_error(DLET_RELOC, "Could not resolve value of __TI_STATIC_BASE\n"); + + /*------------------------------------------------------------------------*/ + /* Process relocations segment by segment. 
*/ + /*------------------------------------------------------------------------*/ + for (seg_idx = 0; seg_idx < num_segs; seg_idx++) + { + /*---------------------------------------------------------------------*/ + /* Relocations should not occur in uninitialized segments. */ + /*---------------------------------------------------------------------*/ + if (!seg[seg_idx].phdr.p_filesz) continue; + + if (rela_table) + process_rela_table(handle, (seg + seg_idx), + rela_table, relanum, &rela_relidx, + ti_static_base, dyn_module); + + if (rel_table) + process_rel_table(handle, (seg + seg_idx), + rel_table, relnum, &rel_relidx, + ti_static_base, dyn_module); + } +} + +/*****************************************************************************/ +/* PROCESS_PLTGOT_RELOCS() */ +/* */ +/* Process all PLTGOT relocation entries. The PLTGOT relocation table */ +/* can be either Elf32_Rel or Elf32_Rela type. All PLTGOT relocations */ +/* ar guaranteed to belong to the same segment. */ +/* */ +/*****************************************************************************/ +static void process_pltgot_relocs(DLOAD_HANDLE handle, + void* plt_reloc_table, + int reltype, + uint32_t pltnum, + DLIMP_Dynamic_Module* dyn_module) +{ + Elf32_Addr r_offset = (reltype == DT_REL) ? + ((struct Elf32_Rel *)plt_reloc_table)->r_offset : + ((struct Elf32_Rela *)plt_reloc_table)->r_offset; + + DLIMP_Loaded_Segment* seg = + (DLIMP_Loaded_Segment*)(dyn_module->loaded_module->loaded_segments.buf); + + uint32_t num_segs = dyn_module->loaded_module->loaded_segments.size; + int32_t plt_relidx = 0; + uint32_t seg_idx = 0; + uint32_t ti_static_base = 0; + + /*------------------------------------------------------------------------*/ + /* Get the value of the static base (__TI_STATIC_BASE) which will be */ + /* passed into the relocation table processing functions. 
*/ + /*------------------------------------------------------------------------*/ + if (!DLSYM_lookup_local_symtab("__TI_STATIC_BASE", dyn_module->symtab, + dyn_module->symnum, &ti_static_base)) + DLIF_error(DLET_RELOC, "Could not resolve value of __TI_STATIC_BASE\n"); + + /*------------------------------------------------------------------------*/ + /* For each segment s, check if the relocation falls within s. If so, */ + /* then all other relocations are guaranteed to fall with s. Process */ + /* all relocations and then return. */ + /*------------------------------------------------------------------------*/ + for (seg_idx = 0; seg_idx < num_segs; seg_idx++) + { + Elf32_Addr seg_start_addr = seg[seg_idx].input_vaddr; + Elf32_Addr seg_end_addr = seg_start_addr + seg[seg_idx].phdr.p_memsz; + + /*---------------------------------------------------------------------*/ + /* Relocations should not occur in uninitialized segments. */ + /*---------------------------------------------------------------------*/ + if(!seg[seg_idx].phdr.p_filesz) continue; + + if (r_offset >= seg_start_addr && + r_offset < seg_end_addr) + { + if (reltype == DT_REL) + process_rel_table(handle, (seg + seg_idx), + (struct Elf32_Rel *)plt_reloc_table, + pltnum, &plt_relidx, + ti_static_base, dyn_module); + else + process_rela_table(handle, (seg + seg_idx), + (struct Elf32_Rela *)plt_reloc_table, + pltnum, &plt_relidx, + ti_static_base, dyn_module); + + break; + } + } +} + +/*****************************************************************************/ +/* RELOCATE() - Perform RELA and REL type relocations for given ELF object */ +/* file that we are in the process of loading and relocating. 
*/ +/*****************************************************************************/ +void DLREL_c60_relocate(DLOAD_HANDLE handle, + LOADER_FILE_DESC *fd, DLIMP_Dynamic_Module *dyn_module) +{ + struct Elf32_Dyn *dyn_nugget = dyn_module->dyntab; + struct Elf32_Rela *rela_table = NULL; + struct Elf32_Rel *rel_table = NULL; + struct Elf32_Rela *rela_plt_table = NULL; + struct Elf32_Rel *rel_plt_table = NULL; + + /*------------------------------------------------------------------------*/ + /* Read the size of the relocation table (DT_RELASZ) and the size per */ + /* relocation (DT_RELAENT) from the dynamic segment. */ + /*------------------------------------------------------------------------*/ + uint32_t relasz = DLIMP_get_first_dyntag(DT_RELASZ, dyn_nugget); + uint32_t relaent = DLIMP_get_first_dyntag(DT_RELAENT, dyn_nugget); + uint32_t relanum = 0; + + /*------------------------------------------------------------------------*/ + /* Read the size of the relocation table (DT_RELSZ) and the size per */ + /* relocation (DT_RELENT) from the dynamic segment. */ + /*------------------------------------------------------------------------*/ + uint32_t relsz = DLIMP_get_first_dyntag(DT_RELSZ, dyn_nugget); + uint32_t relent = DLIMP_get_first_dyntag(DT_RELENT, dyn_nugget); + uint32_t relnum = 0; + + /*------------------------------------------------------------------------*/ + /* Read the size of the relocation table (DT_PLTRELSZ) and the type of */ + /* of the PLTGOT relocation table (DT_PLTREL): one of DT_REL or DT_RELA */ + /*------------------------------------------------------------------------*/ + uint32_t pltrelsz = DLIMP_get_first_dyntag(DT_PLTRELSZ, dyn_nugget); + int pltreltyp = DLIMP_get_first_dyntag(DT_PLTREL, dyn_nugget); + uint32_t pltnum = 0; + + /*------------------------------------------------------------------------*/ + /* Find/record DSBT index associated with this module. 
*/ + /*------------------------------------------------------------------------*/ + if (is_dsbt_module(dyn_module) && + (dyn_module->dsbt_index == DSBT_INDEX_INVALID)) + dyn_module->dsbt_index = + DLIF_get_dsbt_index(dyn_module->loaded_module->file_handle); + + /*------------------------------------------------------------------------*/ + /* Read the PLTGOT relocation table from the file */ + /* The PLTGOT table is a subsection at the end of either the DT_REL or */ + /* DT_RELA table. The size of the table it belongs to DT_REL(A)SZ */ + /* includes the size of the PLTGOT table. So it must be adjusted so that */ + /* the GOT relocation tables only contain actual GOT relocations. */ + /*------------------------------------------------------------------------*/ + if (pltrelsz != INT_MAX && pltrelsz != 0) + { + if (pltreltyp == DT_REL) + { + pltnum = pltrelsz/relent; + relsz -= pltrelsz; + read_rel_table((&rel_plt_table), + DLIMP_get_first_dyntag(DT_JMPREL, dyn_nugget), + pltnum, relent, fd, dyn_module->wrong_endian); + } + + else if (pltreltyp == DT_RELA) + { + pltnum = pltrelsz/relaent; + relasz -= pltrelsz; + read_rela_table((&rela_plt_table), + DLIMP_get_first_dyntag(DT_JMPREL, dyn_nugget), + pltnum, relaent, fd, dyn_module->wrong_endian); + } + + else + { + DLIF_error(DLET_RELOC, + "DT_PLTREL is invalid: must be either %d or %d\n", + DT_REL, DT_RELA); + } + } + + /*------------------------------------------------------------------------*/ + /* Read the DT_RELA GOT relocation table from the file */ + /*------------------------------------------------------------------------*/ + if (relasz != INT_MAX && relasz != 0) + { + relanum = relasz/relaent; + read_rela_table(&rela_table, DLIMP_get_first_dyntag(DT_RELA, dyn_nugget), + relanum, relaent, fd, dyn_module->wrong_endian); + } + + /*------------------------------------------------------------------------*/ + /* Read the DT_REL GOT relocation table from the file */ + 
/*------------------------------------------------------------------------*/ + if (relsz != INT_MAX && relsz != 0) + { + relnum = relsz/relent; + read_rel_table(&rel_table, DLIMP_get_first_dyntag(DT_REL, dyn_nugget), + relnum, relent, fd, dyn_module->wrong_endian); + } + + /*------------------------------------------------------------------------*/ + /* Process the PLTGOT relocations */ + /*------------------------------------------------------------------------*/ + if (rela_plt_table) + process_pltgot_relocs(handle, rela_plt_table, pltreltyp, pltnum, + dyn_module); + + if (rel_plt_table) + process_pltgot_relocs(handle, rel_plt_table, pltreltyp, pltnum, + dyn_module); + + /*------------------------------------------------------------------------*/ + /* Process the GOT relocations */ + /*------------------------------------------------------------------------*/ + if (rel_table || rela_table) + process_got_relocs(handle, rel_table, relnum, rela_table, relanum, + dyn_module); + + /*-------------------------------------------------------------------------*/ + /* Free memory used for ELF relocation table copies. 
*/ + /*-------------------------------------------------------------------------*/ + if (rela_table) DLIF_free(rela_table); + if (rel_table) DLIF_free(rel_table); + if (rela_plt_table) DLIF_free(rela_plt_table); + if (rel_plt_table) DLIF_free(rel_plt_table); +} + +/*****************************************************************************/ +/* UNIT TESTING INTERFACE */ +/*****************************************************************************/ +#ifdef UNIT_TEST +void unit_c60_reloc_do(C60_RELOC_TYPE r_type, + uint8_t *address_space, + uint32_t addend, uint32_t symval, uint32_t pc, + uint32_t static_base, int wrong_endian, + int32_t dsbt_index) +{ + reloc_do(r_type, (uint32_t)address_space, address_space, + addend, symval, pc, FALSE, static_base, dsbt_index); +} + +#if 0 /* RELA TYPE RELOCATIONS HAVE ADDEND IN RELOCATION ENTRY */ +void unit_c60_rel_unpack_addend(C60_RELOC_TYPE r_type, + uint8_t* address, + uint32_t* addend) +{ + rel_unpack_addend(r_type, address, addend); +} +#endif + +BOOL unit_c60_rel_overflow(C60_RELOC_TYPE r_type, int32_t reloc_value) +{ + return rel_overflow(r_type, reloc_value); +} +#endif + diff --git a/src/core/dsp/ocl_load/C60_DLOAD_REL/c60_reloc.h b/src/core/dsp/ocl_load/C60_DLOAD_REL/c60_reloc.h new file mode 100644 index 0000000..8ccd60e --- /dev/null +++ b/src/core/dsp/ocl_load/C60_DLOAD_REL/c60_reloc.h @@ -0,0 +1,30 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +void DLREL_c60_relocate(DLOAD_HANDLE handle, LOADER_FILE_DESC *fd, + DLIMP_Dynamic_Module *dyn_module); diff --git a/src/core/dsp/ocl_load/C60_DLOAD_REL/test_c60_reloc.cpp b/src/core/dsp/ocl_load/C60_DLOAD_REL/test_c60_reloc.cpp new file mode 100644 index 0000000..acde023 --- /dev/null +++ b/src/core/dsp/ocl_load/C60_DLOAD_REL/test_c60_reloc.cpp @@ -0,0 +1,825 @@ +/* +* test_c60_reloc.cpp +* +* C6x Relocation Unit Tests. 
+* +* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/ +* +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* +* Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the +* distribution. +* +* Neither the name of Texas Instruments Incorporated nor the names of +* its contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ + +#include "test_c60_reloc.h" +#include <stdlib.h> +#include <stdio.h> + +/*****************************************************************************/ +/* C60_TestRelocDo */ +/* */ +/* Tests the C60 version of reloc_do. In cases where multiple relocation */ +/* types are implemented in the same way, only one type is tested. 
For */ +/* instance, R_C6000_xxx, R_C6000_yyy, and R_C6000_zzz are implemented in */ +/* the exact same way and, therefore, only R_C6000_xxx is tested. */ +/* */ +/* Each test follows the same flow: */ +/* 1. A valid instruction is constructed for the relocation type being */ +/* tested. */ +/* 2. Addend, symbol value, and pc are then created. */ +/* (NOTE: static base is not needed, and so 0 is passed. Also, same */ +/* endianness is assumed.) */ +/* 3. reloc_do() is called */ +/* 4. The result is checked. */ +/* 5. Repeat if variations should be considered. */ +/* */ +/*****************************************************************************/ +//void C60_TestRelocDo::test_R_C6000_NONE() { } + +void C60_TestRelocDo::test_R_C6000_ABS32() +{ + uint32_t address_space = 0x0; + uint32_t addend = 0x4; + uint32_t symval = 0x2001000; + uint32_t pc = 0x0; + + unit_c60_reloc_do(R_C6000_ABS32, + (uint8_t*) &address_space, + addend, symval, pc, 0, 0, 0); + + TS_ASSERT_EQUALS(address_space, 0x2001004); +} + +void C60_TestRelocDo::test_R_C6000_ABS16() +{ + uint16_t address_space = 0x0; + uint32_t addend = 0x4; + uint32_t symval = 0xFFE; + uint32_t pc = 0x0; + + unit_c60_reloc_do(R_C6000_ABS16, + (uint8_t*) &address_space, + addend, symval, pc, 0, 0, 0); + + TS_ASSERT_EQUALS(address_space, 0x1002); +} + +void C60_TestRelocDo::test_R_C6000_ABS8() +{ + uint8_t address_space = 0x0; + uint32_t addend = 0x4; + uint32_t symval = 0xE; + uint32_t pc = 0x0; + + unit_c60_reloc_do(R_C6000_ABS8, + &address_space, + addend, symval, pc, 0, 0, 0); + + TS_ASSERT_EQUALS(address_space, 0x12); +} + +/*---------------------------------------------------------------------------*/ +/* PC-Relative Relocation Tests */ +/* */ +/* Our relocation handler assumes that the address of 'opcode' is where the */ +/* relocation is. Therefore, when creating a PCR test case, we will compute */ +/* a value for symval and pc in terms of &opcode. 
*/ +/* */ +/*---------------------------------------------------------------------------*/ +void C60_TestRelocDo::test_R_C6000_PCR_S21() +{ + uint32_t opcode = 0x00000010; + uint32_t addend = 0x4; + uint32_t symval = ((uint32_t)&opcode & 0xffffffe0) + 0x50000; + uint32_t pc = 0x0; + + /* Test #1 -- destination is forward from PC */ + /* PCR21 offset = 0x14001 */ + unit_c60_reloc_do(R_C6000_PCR_S21, + (uint8_t*) &opcode, + addend, symval, pc, 0, 0, 0); + + TS_ASSERT_EQUALS(opcode, 0x00a00090); + + /* Test #2 -- symval definition implies offset is negative */ + /* PCR21 offset = 0x1d4001 (signed - negative) */ + opcode = 0x00000010; + symval = ((uint32_t)&opcode & 0xffffffe0) - 0xb0000; + unit_c60_reloc_do(R_C6000_PCR_S21, + (uint8_t*) &opcode, + addend, symval, pc, 0, 0, 0); + + TS_ASSERT_EQUALS(opcode, 0x0ea00090); +} + +void C60_TestRelocDo::test_R_C6000_PCR_S12() +{ + uint32_t opcode = 0x00002120; /* BNOP */ + uint32_t addend = 0x4; + uint32_t symval = ((uint32_t)&opcode & 0xffffffe0) + 0x500; + uint32_t pc = 0x0; + + /* Test #1 -- destination is forward from PC */ + /* PCR12 offset = 0x141 */ + unit_c60_reloc_do(R_C6000_PCR_S12, + (uint8_t*) &opcode, + addend, symval, pc, 0, 0, 0); + + TS_ASSERT_EQUALS(opcode, 0x01412120); + + /* Test #2 -- symval definition implies offset is negative */ + /* PCR12 offset = 0xd41 (signed - negative) */ + opcode = 0x00002120; + symval = ((uint32_t)&opcode & 0xffffffe0) - 0xb00; + unit_c60_reloc_do(R_C6000_PCR_S12, + (uint8_t*) &opcode, + addend, symval, pc, 0, 0, 0); + + TS_ASSERT_EQUALS(opcode, 0x0d412120); +} + +void C60_TestRelocDo::test_R_C6000_PCR_S10() +{ + uint32_t opcode = 0x01001020; /* BDEC */ + uint32_t addend = 0x4; + uint32_t symval = ((uint32_t)&opcode & 0xffffffe0) + 0x50; + uint32_t pc = 0x0; + + /* Test #1 -- destination is forward from PC */ + /* PCR10 offset = 0x15 */ + unit_c60_reloc_do(R_C6000_PCR_S10, + (uint8_t*) &opcode, + addend, symval, pc, 0, 0, 0); + + TS_ASSERT_EQUALS(opcode, 0x0102b020); + + /* Test 
#2 -- symval definition implies offset is negative */ + /* PCR10 offset = 0x355 (signed - negative) */ + opcode = 0x01001020; + symval = ((uint32_t)&opcode & 0xffffffe0) - 0xb0; + unit_c60_reloc_do(R_C6000_PCR_S10, + (uint8_t*) &opcode, + addend, symval, pc, 0, 0, 0); + + TS_ASSERT_EQUALS(opcode, 0x017ab020); +} + +void C60_TestRelocDo::test_R_C6000_PCR_S7() +{ + uint32_t opcode = 0x03006160; /* ADDKPC */ + uint32_t addend = 0x4; + uint32_t symval = ((uint32_t)&opcode & 0xffffffe0) + 0x50; + uint32_t pc = 0x0; + + /* Test #1 -- destination is forward from PC */ + /* PCR7 offset = 0x15 */ + unit_c60_reloc_do(R_C6000_PCR_S7, + (uint8_t*) &opcode, + addend, symval, pc, 0, 0, 0); + + TS_ASSERT_EQUALS(opcode, 0x03156160); + + /* Test #2 -- symval definition implies offset is negative */ + /* PCR7 offset = 0x75 (signed - negative) */ + opcode = 0x03006160; + symval = ((uint32_t)&opcode & 0xffffffe0) - 0x30; + unit_c60_reloc_do(R_C6000_PCR_S7, + (uint8_t*) &opcode, + addend, symval, pc, 0, 0, 0); + + TS_ASSERT_EQUALS(opcode, 0x03756160); +} + +void C60_TestRelocDo::test_R_C6000_ABS_S16() +{ + uint32_t opcode = 0x03000028; /* MVK */ + uint32_t addend = 0x4; + uint32_t symval = 0xFFE; + uint32_t pc = 0x0; + + unit_c60_reloc_do(R_C6000_ABS_S16, + (uint8_t*) &opcode, + addend, symval, pc, 0, 0, 0); + + TS_ASSERT_EQUALS(opcode, 0x03080128); +} + +void C60_TestRelocDo::test_R_C6000_ABS_L16() +{ + uint32_t opcode = 0x03000028; /* MVKL */ + uint32_t addend = 0x4; + uint32_t symval = 0x04560FFE; + uint32_t pc = 0x0; + + unit_c60_reloc_do(R_C6000_ABS_L16, + (uint8_t*) &opcode, + addend, symval, pc, 0, 0, 0); + + TS_ASSERT_EQUALS(opcode, 0x03080128); +} + +void C60_TestRelocDo::test_R_C6000_ABS_H16() +{ + uint32_t opcode = 0x03000068; /* MVKH */ + uint32_t addend = 0x4; + uint32_t symval = 0x04560FFE; + uint32_t pc = 0x0; + + unit_c60_reloc_do(R_C6000_ABS_H16, + (uint8_t*) &opcode, + addend, symval, pc, 0, 0, 0); + + TS_ASSERT_EQUALS(opcode, 0x03022b68); +} + +void 
C60_TestRelocDo::test_R_C6000_SBR_U15_B() +{ + uint32_t opcode = 0x0300002c; /* LDB */ + uint32_t addend = 0x0; + uint32_t static_base = 0x04000000; + uint32_t symval = (static_base + 0x1357); + uint32_t pc = 0x0; + + /* unsigned 15-bit SBR offset = 0x1357 */ + /* encoded in bits 22 - 8 */ + unit_c60_reloc_do(R_C6000_SBR_U15_B, + (uint8_t*) &opcode, + addend, symval, pc, static_base, 0, 0); + + TS_ASSERT_EQUALS(opcode, 0x0313572c); +} + +void C60_TestRelocDo::test_R_C6000_SBR_U15_H() +{ + uint32_t opcode = 0x0300004c; /* LDH */ + uint32_t addend = 0x0; + uint32_t static_base = 0x04000000; + uint32_t symval = (static_base + 0x2246); + uint32_t pc = 0x0; + + /* unsigned 16-bit SBR offset = 0x2246 */ + /* scaled 15-bit SBR offset = 0x1123 */ + /* encoded in bits 22 - 8 */ + unit_c60_reloc_do(R_C6000_SBR_U15_H, + (uint8_t*) &opcode, + addend, symval, pc, static_base, 0, 0); + + TS_ASSERT_EQUALS(opcode, 0x0311234c); +} + +void C60_TestRelocDo::test_R_C6000_SBR_U15_W() +{ + uint32_t opcode = 0x0300006c; /* LDW */ + uint32_t addend = 0x0; + uint32_t static_base = 0x04000000; + uint32_t symval = (static_base + 0x448c); + uint32_t pc = 0x0; + + /* unsigned 17-bit SBR offset = 0x448c */ + /* scaled 15-bit SBR offset = 0x1123 */ + /* encoded in bits 22 - 8 */ + unit_c60_reloc_do(R_C6000_SBR_U15_W, + (uint8_t*) &opcode, + addend, symval, pc, static_base, 0, 0); + + TS_ASSERT_EQUALS(opcode, 0x0311236c); +} + +void C60_TestRelocDo::test_R_C6000_SBR_S16() +{ + uint32_t opcode = 0x03000028; /* MVK */ + uint32_t addend = 0x0; + uint32_t static_base = 0x04000000; + uint32_t symval = (static_base + 0x1357); + uint32_t pc = 0x0; + + /* Test #1 positive signed 16-bit offset */ + /* 16-bit SBR offset = 0x1357 */ + /* encoded in bits 22-7 of opcode */ + unit_c60_reloc_do(R_C6000_SBR_S16, + (uint8_t*) &opcode, + addend, symval, pc, static_base, 0, 0); + + TS_ASSERT_EQUALS(opcode, 0x0309aba8); + + /* Test #2 negative signed 16-bit offset */ + /* 16-bit SBR offset = 0xeca9 (-0x1357) */ + /* 
encoded in bits 22-7 of opcode */ + symval = (static_base - 0x1357); + unit_c60_reloc_do(R_C6000_SBR_S16, + (uint8_t*) &opcode, + addend, symval, pc, static_base, 0, 0); + + TS_ASSERT_EQUALS(opcode, 0x037654a8); +} + +void C60_TestRelocDo::test_R_C6000_SBR_L16_B() +{ + uint32_t opcode = 0x03000028; /* MVKL */ + uint32_t addend = 0x0; + uint32_t static_base = 0x04000000; + uint32_t symval = (static_base + 0x11123); + uint32_t pc = 0x0; + + /* 16-bit SBR offset = 0x1123 */ + /* encoded in bits 22-7 of opcode */ + unit_c60_reloc_do(R_C6000_SBR_L16_B, + (uint8_t*) &opcode, + addend, symval, pc, static_base, 0, 0); + + TS_ASSERT_EQUALS(opcode, 0x030891a8); +} + +void C60_TestRelocDo::test_R_C6000_SBR_L16_H() +{ + uint32_t opcode = 0x03000028; /* MVKL */ + uint32_t addend = 0x0; + uint32_t static_base = 0x04000000; + uint32_t symval = (static_base + 0x12246); + uint32_t pc = 0x0; + + /* 17-bit SBR offset = 0x12246 */ + /* scaled SBR offset = 0x9123 */ + /* encoded in bits 22-7 of opcode */ + unit_c60_reloc_do(R_C6000_SBR_L16_H, + (uint8_t*) &opcode, + addend, symval, pc, static_base, 0, 0); + + TS_ASSERT_EQUALS(opcode, 0x034891a8); +} + +void C60_TestRelocDo::test_R_C6000_SBR_L16_W() +{ + uint32_t opcode = 0x03000028; /* MVKL */ + uint32_t addend = 0x0; + uint32_t static_base = 0x04000000; + uint32_t symval = (static_base + 0x1448c); + uint32_t pc = 0x0; + + /* 18-bit SBR offset = 0x1448c */ + /* scaled SBR offset = 0x5123 */ + /* encoded in bits 22-7 of opcode */ + unit_c60_reloc_do(R_C6000_SBR_L16_W, + (uint8_t*) &opcode, + addend, symval, pc, static_base, 0, 0); + + TS_ASSERT_EQUALS(opcode, 0x032891a8); +} + +void C60_TestRelocDo::test_R_C6000_SBR_H16_B() +{ + uint32_t opcode = 0x03000068; /* MVKH */ + uint32_t addend = 0x0; + uint32_t static_base = 0x04000000; + uint32_t symval = (static_base + 0x357448c); + uint32_t pc = 0x0; + + /* total SBR offset = 0x357448c */ + /* upper 16-bits of SBR offset = 0x357 */ + /* encoded in bits 22-7 of opcode */ + 
unit_c60_reloc_do(R_C6000_SBR_H16_B, + (uint8_t*) &opcode, + addend, symval, pc, static_base, 0, 0); + + TS_ASSERT_EQUALS(opcode, 0x0301abe8); +} + +void C60_TestRelocDo::test_R_C6000_SBR_H16_H() +{ + uint32_t opcode = 0x03000068; /* MVKH */ + uint32_t addend = 0x0; + uint32_t static_base = 0x04000000; + uint32_t symval = (static_base + 0x357448c); + uint32_t pc = 0x0; + + /* total SBR offset = 0x357448c */ + /* scaled SBR offset = 0x1aba246 */ + /* upper 16-bits of scaled SBR offset = 0x1ab */ + /* encoded in bits 22-7 of opcode */ + unit_c60_reloc_do(R_C6000_SBR_H16_H, + (uint8_t*) &opcode, + addend, symval, pc, static_base, 0, 0); + + TS_ASSERT_EQUALS(opcode, 0x0300d5e8); +} + +void C60_TestRelocDo::test_R_C6000_SBR_H16_W() +{ + uint32_t opcode = 0x03000068; /* MVKH */ + uint32_t addend = 0x0; + uint32_t static_base = 0x04000000; + uint32_t symval = (static_base + 0x357448c); + uint32_t pc = 0x0; + + /* total SBR offset = 0x357448c */ + /* scaled SBR offset = 0x0d5d123 */ + /* upper 16-bits of scaled SBR offset = 0x0d5 */ + /* encoded in bits 22-7 of opcode */ + unit_c60_reloc_do(R_C6000_SBR_H16_W, + (uint8_t*) &opcode, + addend, symval, pc, static_base, 0, 0); + + TS_ASSERT_EQUALS(opcode, 0x03006ae8); +} + +/* The DSBT table is accessed via DP-relative addressing with */ +/* an LDW instruction, but the DSBT_INDEX is really an index */ +/* into the DSBT table, the index is scaled to a 4-word offset. */ +void C60_TestRelocDo::test_R_C6000_DSBT_INDEX() +{ + uint32_t opcode = 0x0300006c; /* LDW */ + uint32_t addend = 0x0; + uint32_t static_base = 0x04000000; + uint32_t symval = static_base; + uint32_t pc = 0x0; + + unit_c60_reloc_do(R_C6000_DSBT_INDEX, + (uint8_t*) &opcode, + addend, symval, pc, static_base, 0, 3); + + TS_ASSERT_EQUALS(opcode, 0x0300036c); +} + +/*****************************************************************************/ +/* C60_TestRelUnpackAddend */ +/* */ +/* Tests the C60 rel_unpack_addend function. 
*/ +/* */ +/* In cases where the addends are unpacked in the same way, only one is */ +/* tested. */ +/* */ +/* All tests follow the same flow: */ +/* */ +/* 1. Create a valid instruction for the relocation type, where the addend */ +/* is packed in the instruction. */ +/* 2. Call rel_unpack_addend(). */ +/* 3. Check that the addend is correct. */ +/* */ +/* Relocations may be tested multiple times to handle variations, such as */ +/* positive/negative addends, extra bits depending on the encoding, etc. */ +/* */ +/* NOTE!! C60 ONLY SUPPORTS RELA TYPE RELOCATIONS, SO ADDEND FIELD IS STORED */ +/* IN RELOCATION ENTRY ITSELF. */ +/*****************************************************************************/ +#if 0 +void C60_TestRelUnpackAddend::test_R_C6000_ABS32() +{ + uint32_t address_space=0xFEDCBA9; + uint32_t addend; + + unit_c60_rel_unpack_addend(R_C6000_ABS32, + (uint8_t*)&address_space, + &addend); + + TS_ASSERT_EQUALS(addend, address_space); +} + +void C60_TestRelUnpackAddend::test_R_C6000_ABS16() +{ + uint16_t address_space=0x7FFF; + uint32_t addend; + + unit_c60_rel_unpack_addend(R_C6000_ABS16, + (uint8_t*)&address_space, + &addend); + + TS_ASSERT_EQUALS(addend, 0x7FFF); + + address_space = 0x8000; + + unit_c60_rel_unpack_addend(R_C6000_ABS16, + (uint8_t*)&address_space, + &addend); + + TS_ASSERT_EQUALS(addend, 0xFFFF8000); +} +#endif + + +/*****************************************************************************/ +/* C60_TestRelOverflow */ +/* */ +/* Test the C60 rel_overflow function. */ +/* */ +/* In each case, we test the upper and lower bounds of each relocation type. */ +/* Only relocation types where the overflow is checked in rel_overflow are */ +/* considered. In most cases four tests are performed to test the upper and */ +/* lower bounds (1 pass and 1 fail for each). */ +/* */ +/* NOTE!! HAVEN'T REFACTORED OVERFLOW CHECK OUT OF RELOCATION HANDLERS FOR */ +/* C60, SO OVERFLOW SHOULD BE TESTED AS PART OF THE RELOC DO(???) 
*/ +/* */ +/*****************************************************************************/ +void C60_TestRelOverflow::test_R_C6000_ABS16() +{ + int32_t reloc_val = 0xFFFF; + int rval; + + rval = unit_c60_rel_overflow(R_C6000_ABS16, reloc_val); + + TS_ASSERT_EQUALS(rval, 0); + + reloc_val = 0x10000; + + rval = unit_c60_rel_overflow(R_C6000_ABS16, reloc_val); + + TS_ASSERT_EQUALS(rval, 1); + + reloc_val = -0x8000; + + rval = unit_c60_rel_overflow(R_C6000_ABS16, reloc_val); + + TS_ASSERT_EQUALS(rval, 0); + + reloc_val = -0x8001; + + rval = unit_c60_rel_overflow(R_C6000_ABS16, reloc_val); + + TS_ASSERT_EQUALS(rval, 1); +} + +void C60_TestRelOverflow::test_R_C6000_ABS8() +{ + int32_t reloc_val = 0xFF; + int rval; + + rval = unit_c60_rel_overflow(R_C6000_ABS8, reloc_val); + + TS_ASSERT_EQUALS(rval, 0); + + reloc_val = 0x100; + + rval = unit_c60_rel_overflow(R_C6000_ABS8, reloc_val); + + TS_ASSERT_EQUALS(rval, 1); + + reloc_val = -0x80; + + rval = unit_c60_rel_overflow(R_C6000_ABS8, reloc_val); + + TS_ASSERT_EQUALS(rval, 0); + + reloc_val = -0x81; + + rval = unit_c60_rel_overflow(R_C6000_ABS8, reloc_val); + + TS_ASSERT_EQUALS(rval, 1); +} + +void C60_TestRelOverflow::test_R_C6000_PCR_S21() +{ + int32_t reloc_val = 0x3FFFFC; + int rval; + + rval = unit_c60_rel_overflow(R_C6000_PCR_S21, reloc_val); + + TS_ASSERT_EQUALS(rval, 0); + + reloc_val = 0x400000; + + rval = unit_c60_rel_overflow(R_C6000_PCR_S21, reloc_val); + + TS_ASSERT_EQUALS(rval, 1); + + reloc_val = -0x400000; + + rval = unit_c60_rel_overflow(R_C6000_PCR_S21, reloc_val); + + TS_ASSERT_EQUALS(rval, 0); + + reloc_val = -0x400001; + + rval = unit_c60_rel_overflow(R_C6000_PCR_S21, reloc_val); + + TS_ASSERT_EQUALS(rval, 1); +} + +void C60_TestRelOverflow::test_R_C6000_PCR_S12() +{ + int32_t reloc_val = 0x1FFC; + int rval; + + rval = unit_c60_rel_overflow(R_C6000_PCR_S12, reloc_val); + + TS_ASSERT_EQUALS(rval, 0); + + reloc_val = 0x2000; + + rval = unit_c60_rel_overflow(R_C6000_PCR_S12, reloc_val); + + 
TS_ASSERT_EQUALS(rval, 1); + + reloc_val = -0x2000; + + rval = unit_c60_rel_overflow(R_C6000_PCR_S12, reloc_val); + + TS_ASSERT_EQUALS(rval, 0); + + reloc_val = -0x2001; + + rval = unit_c60_rel_overflow(R_C6000_PCR_S12, reloc_val); + + TS_ASSERT_EQUALS(rval, 1); +} + +void C60_TestRelOverflow::test_R_C6000_PCR_S10() +{ + int32_t reloc_val = 0x7FC; + int rval; + + rval = unit_c60_rel_overflow(R_C6000_PCR_S10, reloc_val); + + TS_ASSERT_EQUALS(rval, 0); + + reloc_val = 0x800; + + rval = unit_c60_rel_overflow(R_C6000_PCR_S10, reloc_val); + + TS_ASSERT_EQUALS(rval, 1); + + reloc_val = -0x800; + + rval = unit_c60_rel_overflow(R_C6000_PCR_S10, reloc_val); + + TS_ASSERT_EQUALS(rval, 0); + + reloc_val = -0x801; + + rval = unit_c60_rel_overflow(R_C6000_PCR_S10, reloc_val); + + TS_ASSERT_EQUALS(rval, 1); +} + +void C60_TestRelOverflow::test_R_C6000_PCR_S7() +{ + int32_t reloc_val = 0xFC; + int rval; + + rval = unit_c60_rel_overflow(R_C6000_PCR_S7, reloc_val); + + TS_ASSERT_EQUALS(rval, 0); + + reloc_val = 0x100; + + rval = unit_c60_rel_overflow(R_C6000_PCR_S7, reloc_val); + + TS_ASSERT_EQUALS(rval, 1); + + reloc_val = -0x100; + + rval = unit_c60_rel_overflow(R_C6000_PCR_S7, reloc_val); + + TS_ASSERT_EQUALS(rval, 0); + + reloc_val = -0x101; + + rval = unit_c60_rel_overflow(R_C6000_PCR_S7, reloc_val); + + TS_ASSERT_EQUALS(rval, 1); +} + +void C60_TestRelOverflow::test_R_C6000_SBR_S16() +{ + int32_t reloc_val = 0x7FFF; + int rval; + + rval = unit_c60_rel_overflow(R_C6000_SBR_S16, reloc_val); + + TS_ASSERT_EQUALS(rval, 0); + + reloc_val = 0x8000; + + rval = unit_c60_rel_overflow(R_C6000_SBR_S16, reloc_val); + + TS_ASSERT_EQUALS(rval, 1); + + reloc_val = -0x8000; + + rval = unit_c60_rel_overflow(R_C6000_SBR_S16, reloc_val); + + TS_ASSERT_EQUALS(rval, 0); + + reloc_val = -0x8001; + + rval = unit_c60_rel_overflow(R_C6000_SBR_S16, reloc_val); + + TS_ASSERT_EQUALS(rval, 1); +} + +void C60_TestRelOverflow::test_R_C6000_ABS_S16() +{ + int32_t reloc_val = 0x7FFF; + int rval; + + rval = 
unit_c60_rel_overflow(R_C6000_ABS_S16, reloc_val); + + TS_ASSERT_EQUALS(rval, 0); + + reloc_val = 0x8000; + + rval = unit_c60_rel_overflow(R_C6000_ABS_S16, reloc_val); + + TS_ASSERT_EQUALS(rval, 1); + + reloc_val = -0x8000; + + rval = unit_c60_rel_overflow(R_C6000_ABS_S16, reloc_val); + + TS_ASSERT_EQUALS(rval, 0); + + reloc_val = -0x8001; + + rval = unit_c60_rel_overflow(R_C6000_ABS_S16, reloc_val); + + TS_ASSERT_EQUALS(rval, 1); +} + +void C60_TestRelOverflow::test_R_C6000_SBR_U15_B() +{ + uint32_t reloc_val = 0x7FFF; + int rval; + + rval = unit_c60_rel_overflow(R_C6000_SBR_U15_B, reloc_val); + + TS_ASSERT_EQUALS(rval, 0); + + reloc_val = 0x8000; + + rval = unit_c60_rel_overflow(R_C6000_SBR_U15_B, reloc_val); + + TS_ASSERT_EQUALS(rval, 1); +} + +void C60_TestRelOverflow::test_R_C6000_SBR_U15_H() +{ + uint32_t reloc_val = 0xFFFE; + int rval; + + rval = unit_c60_rel_overflow(R_C6000_SBR_U15_H, reloc_val); + + TS_ASSERT_EQUALS(rval, 0); + + reloc_val = 0xFFFF; + + rval = unit_c60_rel_overflow(R_C6000_SBR_U15_H, reloc_val); + + TS_ASSERT_EQUALS(rval, 1); +} + +void C60_TestRelOverflow::test_R_C6000_SBR_U15_W() +{ + uint32_t reloc_val = 0x1FFFC; + int rval; + + rval = unit_c60_rel_overflow(R_C6000_SBR_U15_W, reloc_val); + + TS_ASSERT_EQUALS(rval, 0); + + reloc_val = 0x1FFFD; + + rval = unit_c60_rel_overflow(R_C6000_SBR_U15_W, reloc_val); + + TS_ASSERT_EQUALS(rval, 1); +} + +void C60_TestRelOverflow::test_R_C6000_DSBT_INDEX() +{ + uint32_t reloc_val = 0x1FFFC; + int rval; + + rval = unit_c60_rel_overflow(R_C6000_DSBT_INDEX, reloc_val); + + TS_ASSERT_EQUALS(rval, 0); + + reloc_val = 0x1FFFD; + + rval = unit_c60_rel_overflow(R_C6000_DSBT_INDEX, reloc_val); + + TS_ASSERT_EQUALS(rval, 1); +} + diff --git a/src/core/dsp/ocl_load/C60_DLOAD_REL/test_c60_reloc.h b/src/core/dsp/ocl_load/C60_DLOAD_REL/test_c60_reloc.h new file mode 100644 index 0000000..67a437d --- /dev/null +++ b/src/core/dsp/ocl_load/C60_DLOAD_REL/test_c60_reloc.h @@ -0,0 +1,101 @@ +/* +* test_c60_reloc.h +* 
+* Specification of C6x-specific relocation handler unit tests. +* +* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/ +* +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* +* Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the +* distribution. +* +* Neither the name of Texas Instruments Incorporated nor the names of +* its contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+* +*/ + +#ifndef _TEST_C60_RELOC_H_ +#define _TEST_C60_RELOC_H_ +#include "c60_elf32.h" +#include <cxxtest/TestSuite.h> + +extern "C" +{ +extern void unit_c60_reloc_do(C60_RELOC_TYPE r_type, uint8_t* address, + uint32_t addend, uint32_t symval, uint32_t pc, + uint32_t base_pointer, int wrong_endian, int32_t dsbt_index); + +extern void unit_c60_rel_unpack_addend(C60_RELOC_TYPE r_type, + uint8_t* address, + uint32_t* addend); + +extern int unit_c60_rel_overflow(C60_RELOC_TYPE r_type, int32_t reloc_value); + +} + +class C60_TestRelocDo : public CxxTest::TestSuite +{ + public: + void test_R_C6000_ABS32(); + void test_R_C6000_ABS16(); + void test_R_C6000_ABS8(); + void test_R_C6000_PCR_S21(); + void test_R_C6000_PCR_S12(); + void test_R_C6000_PCR_S10(); + void test_R_C6000_PCR_S7(); + void test_R_C6000_ABS_S16(); + void test_R_C6000_ABS_L16(); + void test_R_C6000_ABS_H16(); + void test_R_C6000_SBR_U15_B(); + void test_R_C6000_SBR_U15_H(); + void test_R_C6000_SBR_U15_W(); + void test_R_C6000_SBR_S16(); + void test_R_C6000_SBR_L16_B(); + void test_R_C6000_SBR_L16_H(); + void test_R_C6000_SBR_L16_W(); + void test_R_C6000_SBR_H16_B(); + void test_R_C6000_SBR_H16_H(); + void test_R_C6000_SBR_H16_W(); + void test_R_C6000_DSBT_INDEX(); +}; + +class C60_TestRelOverflow : public CxxTest::TestSuite +{ + public: + void test_R_C6000_ABS16(); + void test_R_C6000_ABS8(); + void test_R_C6000_PCR_S21(); + void test_R_C6000_PCR_S12(); + void test_R_C6000_PCR_S10(); + void test_R_C6000_PCR_S7(); + void test_R_C6000_SBR_S16(); + void test_R_C6000_ABS_S16(); + void test_R_C6000_SBR_U15_B(); + void test_R_C6000_SBR_U15_H(); + void test_R_C6000_SBR_U15_W(); + void test_R_C6000_DSBT_INDEX(); +}; + +#endif /* _TEST_C60_RELOC_H_ */ diff --git a/src/core/dsp/ocl_load/CMakeLists.txt b/src/core/dsp/ocl_load/CMakeLists.txt new file mode 100644 index 0000000..a459542 --- /dev/null +++ b/src/core/dsp/ocl_load/CMakeLists.txt @@ -0,0 +1,26 @@ +include_directories (. 
+ C60_DLOAD_REL + C60_DLOAD_DYN + DLOAD_SYM + DLOAD + DLOAD_API + DLWRAPPER + ) + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -DC60_TARGET -DLOADER_DEBUG -g -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast") + +set(OCL_LOAD_SRC_FILES + ocl_load.c + C60_DLOAD_REL/c60_reloc.c + C60_DLOAD_DYN/c60_dynamic.c + DLOAD_SYM/symtab.c + DLOAD/ArrayList.c + DLOAD/dload.c + DLOAD/elf32.c + DLOAD/dload_endian.c +) + +add_library(oclload STATIC ${OCL_LOAD_SRC_FILES}) + +SET(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}/lib) + diff --git a/src/core/dsp/ocl_load/DLOAD/ArrayList.c b/src/core/dsp/ocl_load/DLOAD/ArrayList.c new file mode 100644 index 0000000..4452bfc --- /dev/null +++ b/src/core/dsp/ocl_load/DLOAD/ArrayList.c @@ -0,0 +1,122 @@ +/* +* ArrayList.c +* +* Array_List is a C implementation of a C++ vector class. +* +* This class emulates a resizable array along the lines of a C++ +* vector or Java ArrayList class in C, and uses the convention +* of passing a pointer to the current "object" as the first +* argument. +* +* Usage is defined as follows: +* +* Array_List obj; +* AL_initialize(&obj, sizeof(type_name), num_elem); +* +* ... +* +* type_name *ptr = (type_name*)(obj.buf); +* for(i = 0; i < AL_size(&obj); i++) +* do_something_to(ptr[i]); +* type_name to_append = ...; +* AL_append(&obj, &to_append); +* +* ... +* +* AL_destroy(&obj); +* +* +* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/ +* +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* +* Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the +* distribution. 
+* +* Neither the name of Texas Instruments Incorporated nor the names of +* its contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ + +#include <inttypes.h> +#include <string.h> +#include "ArrayList.h" +#include "dload_api.h" + +/*****************************************************************************/ +/* AL_INITIALIZE() - Initialize a newly created Array_List object. */ +/*****************************************************************************/ +void AL_initialize(Array_List* obj, int32_t type_size, int32_t num_elem) +{ + if (num_elem == 0) num_elem = 1; + obj->buf = DLIF_malloc(type_size * num_elem); + obj->type_size = type_size; + obj->size = 0; + obj->buffer_size = num_elem; +} + +/*****************************************************************************/ +/* AL_APPEND() - Append an element to the end of an Array_List. 
*/
+/*                                                                           */
+/*    obj       - list to extend.                                            */
+/*    to_append - address of the element; obj->type_size bytes are copied.   */
+/*                                                                           */
+/*    When the buffer is full its capacity is doubled.  If the larger        */
+/*    buffer cannot be allocated, a warning is issued, the element is NOT    */
+/*    appended and the list is left exactly as it was.                       */
+/*****************************************************************************/
+void AL_append(Array_List* obj, void* to_append)
+{
+   /*------------------------------------------------------------------------*/
+   /* Grow the buffer if we need more space to add the new data to it.       */
+   /* The new buffer is fully set up before obj is touched, so a failed      */
+   /* allocation cannot lose the elements that are already stored.           */
+   /*------------------------------------------------------------------------*/
+   if (obj->size >= obj->buffer_size)
+   {
+      int32_t new_capacity = (obj->buffer_size > 0) ? obj->buffer_size * 2 : 1;
+      void*   new_buffer   = DLIF_malloc(new_capacity * obj->type_size);
+
+      if (!new_buffer)
+      {
+         DLIF_warning(DLWT_MISC,
+                      "AL_append: out of memory, element not appended.\n");
+         return;
+      }
+
+      if (obj->buf)
+      {
+         memcpy(new_buffer, obj->buf, obj->size * obj->type_size);
+         DLIF_free(obj->buf);
+      }
+
+      obj->buf         = new_buffer;
+      obj->buffer_size = new_capacity;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* There is now space in the buffer for the new data; copy it in right    */
+   /* after the data that is already there and count it.                     */
+   /*------------------------------------------------------------------------*/
+   memcpy(((uint8_t*)obj->buf) + obj->type_size * obj->size, to_append,
+          obj->type_size);
+   obj->size++;
+}
+
+/*****************************************************************************/
+/* AL_SIZE() - Get the number of elements in an Array_List.                  */
+/*****************************************************************************/
+int32_t AL_size(Array_List* obj)
+{
+   return obj->size;
+}
+
+/*****************************************************************************/
+/* AL_DESTROY() - Free up memory associated with an Array_List that is no    */
+/*                longer in use.
*/ +/*****************************************************************************/ +void AL_destroy(Array_List* obj) +{ + DLIF_free(obj->buf); +} diff --git a/src/core/dsp/ocl_load/DLOAD/ArrayList.h b/src/core/dsp/ocl_load/DLOAD/ArrayList.h new file mode 100644 index 0000000..2c03788 --- /dev/null +++ b/src/core/dsp/ocl_load/DLOAD/ArrayList.h @@ -0,0 +1,92 @@ +/* +* ArrayList.h +* +* This implementation of ArrayList is a replacement for the C++ +* vector class in C. +* +* This class emulates a resizable array along the lines of a C++ +* vector or Java ArrayList class in C, and uses the convention +* of passing a pointer to the current "object" as the first +* argument. +* +* Usage is defined as follows: +* +* Array_List obj; +* AL_initialize(&obj, sizeof(type_name)); +* +* ... +* +* type_name *ptr = (type_name*)(obj.buf); +* for(i = 0; i < AL_size(&obj); i++) +* do_something_to(ptr[i]); +* type_name to_append = ...; +* AL_append(&obj, &to_append); +* +* ... +* +* AL_destroy(&obj); +* +* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/ +* +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* +* Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the +* distribution. +* +* Neither the name of Texas Instruments Incorporated nor the names of +* its contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. 
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef ARRAYLIST_H
+#define ARRAYLIST_H
+
+#include <inttypes.h>
+
+/**********************************************************************/
+/* Array_List - growable array of fixed-size elements.                */
+/**********************************************************************/
+typedef struct
+{
+   void   *buf;          /* element storage, allocated by AL_initialize()/AL_append() */
+   int32_t type_size;    /* size of one element, in bytes                             */
+   int32_t size;         /* number of elements currently stored                       */
+   int32_t buffer_size;  /* capacity of buf, counted in elements                      */
+} Array_List;
+
+/*--------------------------------------------------------------------*/
+/* Array_List Member Functions:                                       */
+/*                                                                    */
+/* AL_initialize() - Initialize a newly created Array_List object.    */
+/* AL_append() - Append an element to the end of an Array_List.       */
+/* AL_size() - Get number of elements in an Array_List.               */
+/* AL_destroy() - Free memory associated with an Array_List that is   */
+/*                no longer in use.
*/
+/*--------------------------------------------------------------------*/
+void AL_initialize(Array_List* obj, int32_t type_size, int32_t num_elem);
+void AL_append(Array_List* obj, void* to_append);
+int32_t AL_size(Array_List* obj);
+void AL_destroy(Array_List* obj);
+
+#endif
diff --git a/src/core/dsp/ocl_load/DLOAD/Queue.h b/src/core/dsp/ocl_load/DLOAD/Queue.h
new file mode 100644
index 0000000..3f85c16
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/Queue.h
@@ -0,0 +1,194 @@
+/*
+* Queue.h
+*
+* Interface to Linked List
+* ------------------------
+*
+* This is an implementation of a type-independent linked list class for C.
+* It's basically a template class, but uses macros instead so that it can
+* be compiled with a C-only compiler.
+*
+* To define a linked list class:
+*    #include "Queue.h"
+*    TYPE_QUEUE_DEFINITION(object_type,Class_Identifier)
+*
+* In a separate C file:
+*    #include "Queue.h"
+*    TYPE_QUEUE_DEFINITION(object_type,Class_Identifier)
+*    TYPE_QUEUE_IMPLEMENTATION(object_type,Class_Identifier)
+*
+* Now, to create a list:
+*    Class_Identifier_Queue name;
+* Get it initialized to zero everywhere somehow, maybe like this:
+*    Class_Identifier_initialize_queue(&name);
+*
+* To add to the list:
+*    Class_Identifier_enqueue(&name, object);
+*
+* To iterate over the list:
+*    Class_Identifier_Queue_Node *it = name.front_ptr;
+*    while(it) { do_something_to_(it->value); it = it->next_ptr; }
+*
+* To delete from the list:
+*    If it's the first node:
+*       Class_Identifier_dequeue(&name);
+*    If it's not:
+*       predecessor_node->next_ptr = deleted_node->next_ptr;
+*       name.size--;
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+* +* Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the +* distribution. +* +* Neither the name of Texas Instruments Incorporated nor the names of +* its contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ + +#ifndef QUEUE_H +#define QUEUE_H + +#include <inttypes.h> +#include "dload_api.h" + +/*****************************************************************************/ +/* TYPE_QUEUE_DEFINITION() - Define structure specifications for a linked */ +/* list of t_name objects. 
*/
+/*****************************************************************************/
+#define TYPE_QUEUE_DEFINITION(t, t_name) \
+struct t_name##_Queue_Node_ \
+{ \
+   t value;                               /* payload stored by value */ \
+   struct t_name##_Queue_Node_* next_ptr; /* next node toward the back, NULL at the back */ \
+}; \
+typedef struct t_name##_Queue_Node_ t_name##_Queue_Node; \
+ \
+typedef struct \
+{ \
+   t_name##_Queue_Node* front_ptr; /* node that <t_name>_dequeue() removes next */ \
+   t_name##_Queue_Node* back_ptr;  /* node added by the latest <t_name>_enqueue() */ \
+   int32_t size;                   /* number of nodes in the list */ \
+} t_name##_Queue; \
+ \
+extern void t_name##_initialize_queue(t_name##_Queue* queue); \
+extern void t_name##_enqueue(t_name##_Queue* queue, t to_enqueue); \
+extern t t_name##_dequeue(t_name##_Queue* queue); \
+extern void t_name##_remove(t_name##_Queue* queue, t to_remove);
+
+/*****************************************************************************/
+/* TYPE_QUEUE_INITIALIZER - Static initializer for an empty queue, in the    */
+/*****************************************************************************/
+#define TYPE_QUEUE_INITIALIZER {NULL, NULL, 0} /* order: front_ptr, back_ptr, size */
+
+
+/*****************************************************************************/
+/* TYPE_QUEUE_IMPLEMENTATION() - Define member functions of new linked list  */
+/*                               "class" of t_name objects.
*/ +/* */ +/* <type>_initialize_queue() - clears the queue */ +/* <type>_enqueue() - adds a <t> type object to the end of the queue */ +/* <type>_dequeue() - remove a <t> type object from the front of the queue */ +/* and provide access to it to the caller */ +/* <type>_remove() - find and remove a <t> type object from the queue */ +/*****************************************************************************/ +#define TYPE_QUEUE_IMPLEMENTATION(t, t_name) \ +void t_name##_initialize_queue (t_name##_Queue* queue) \ +{ \ + queue->front_ptr = queue->back_ptr = NULL; \ + queue->size = 0; \ +} \ +void t_name##_enqueue(t_name##_Queue* queue, t to_enqueue) \ +{ \ + queue->size++; \ + \ + if(!queue->back_ptr) \ + queue->back_ptr = queue->front_ptr = \ + (t_name##_Queue_Node*) \ + (DLIF_malloc(sizeof(t_name##_Queue_Node))); \ + else \ + { \ + queue->back_ptr->next_ptr = \ + (t_name##_Queue_Node*)(DLIF_malloc( \ + sizeof(t_name##_Queue_Node))); \ + queue->back_ptr = queue->back_ptr->next_ptr; \ + } \ + \ + queue->back_ptr->value = to_enqueue; \ + queue->back_ptr->next_ptr = NULL; \ +} \ + \ +t t_name##_dequeue(t_name##_Queue* queue) \ +{ \ + t to_ret; \ + t_name##_Queue_Node* next_ptr = NULL; \ + \ + if (!queue->size) return (t) NULL; \ + \ + next_ptr = queue->front_ptr->next_ptr; \ + queue->size--; \ + to_ret = queue->front_ptr->value; \ + DLIF_free((void*)(queue->front_ptr)); \ + \ + if(!queue->size) \ + queue->front_ptr = queue->back_ptr = NULL; \ + else \ + queue->front_ptr = next_ptr; \ + \ + return to_ret; \ +} \ + \ +void t_name##_remove(t_name##_Queue* queue, t to_remove) \ +{ \ + t_name##_Queue_Node* prev_ptr = NULL; \ + t_name##_Queue_Node* curr_ptr = queue->front_ptr; \ + t_name##_Queue_Node* next_ptr = NULL; \ + \ + for (; curr_ptr; curr_ptr = next_ptr) \ + { \ + next_ptr = curr_ptr->next_ptr; \ + if (curr_ptr->value == to_remove) break; \ + prev_ptr = curr_ptr; \ + } \ + \ + if (curr_ptr) \ + { \ + if (prev_ptr) prev_ptr->next_ptr = next_ptr; \ + queue->size--; 
\ + DLIF_free((void*)(curr_ptr)); \ + } \ + \ + if (!queue->size) \ + queue->front_ptr = queue->back_ptr = NULL; \ + else \ + { \ + if (!prev_ptr) queue->front_ptr = next_ptr; \ + if (!next_ptr) queue->back_ptr = prev_ptr; \ + } \ +} + + +#endif diff --git a/src/core/dsp/ocl_load/DLOAD/Stack.h b/src/core/dsp/ocl_load/DLOAD/Stack.h new file mode 100644 index 0000000..d36f5e0 --- /dev/null +++ b/src/core/dsp/ocl_load/DLOAD/Stack.h @@ -0,0 +1,155 @@ +/* +* Stack.h +* +* Interface to Stack +* ------------------ +* +* This is an implementation of a type-independent stack implemented as +* a signly linked list class for C. It's basically a template class, but +* uses macros instead, so that it can be compiled with a C-only compiler. +* +* To define a Stack class: +* #include "Stack.h" +* TYPE_STACK_DEFINITION(object_type,Class_Identifier) +* +* In a separate C file: +* #include "Stack.h" +* TYPE_STACK_DEFINITION(object_type,Class_Identifier) +* TYPE_STACK_IMPLEMENTATION(object_type,Class_Identifier) +* +* Now, to create a stack: +* struct Class_Identifier_Stack name; +* Get it initialized to zero everywhere somehow, maybe like this: +* initialize_stack_Class_Identifier(&name); +* +* To add to the stack: +* push_Class_Identifier(&name, object); +* +* To access the top of the stack: +* Class_Identifier_Stack_Node *tos = name.top_ptr; +* do_something_to_(tos->value); +* +* To delete from the stack: +* if (name.size > 0) pop_Class_Identifier(&name); +* +* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/ +* +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. 
+* +* Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the +* distribution. +* +* Neither the name of Texas Instruments Incorporated nor the names of +* its contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ + +#ifndef STACK_H +#define STACK_H + +#include <inttypes.h> +#include "dload_api.h" + +/*****************************************************************************/ +/* TYPE_STACK_DEFINITION() - Define structure specifications for a last-in, */ +/* first-out linked list of t_name objects. 
*/
+/*****************************************************************************/
+#define TYPE_STACK_DEFINITION(t, t_name) \
+struct t_name##_Stack_Node_ \
+{ \
+   t value;                               /* payload stored by value */ \
+   struct t_name##_Stack_Node_* next_ptr; /* node pushed just before this one, NULL at the bottom */ \
+}; \
+typedef struct t_name##_Stack_Node_ t_name##_Stack_Node; \
+ \
+typedef struct \
+{ \
+   t_name##_Stack_Node* top_ptr;    /* most recently pushed node */ \
+   t_name##_Stack_Node* bottom_ptr; /* oldest node still on the stack */ \
+   int size;                        /* number of nodes on the stack */ \
+} t_name##_Stack; \
+ \
+extern void t_name##_initialize_stack(t_name##_Stack* stack); \
+extern void t_name##_push(t_name##_Stack* stack, t to_push); \
+extern t t_name##_pop(t_name##_Stack* stack);
+
+/*****************************************************************************/
+/* TYPE_STACK_INITIALIZER - Define the initializer to initialize Stacks.     */
+/*****************************************************************************/
+#define TYPE_STACK_INITIALIZER {NULL, NULL, 0 } /* order: top_ptr, bottom_ptr, size */
+
+/*****************************************************************************/
+/* TYPE_STACK_IMPLEMENTATION() - Define member functions of new LIFO linked  */
+/*                               list "class" of t_name objects.
*/ +/* */ +/* <type>_initialize_stack() - clears the stack */ +/* <type>_push() - pushes a <t> type object to the top of the stack */ +/* <type>_pop() - pop a <t> type object from the top of the stack */ +/* and provide access to it to the caller */ +/*****************************************************************************/ +#define TYPE_STACK_IMPLEMENTATION(t, t_name) \ +void t_name##_initialize_stack (t_name##_Stack* stack) \ +{ \ + stack->top_ptr = stack->bottom_ptr = NULL; \ + stack->size = 0; \ +} \ +void t_name##_push(t_name##_Stack* stack, t to_push) \ +{ \ + stack->size++; \ + \ + if(!stack->top_ptr) \ + { \ + stack->bottom_ptr = stack->top_ptr = \ + (t_name##_Stack_Node*)(DLIF_malloc(sizeof(t_name##_Stack_Node))); \ + stack->top_ptr->next_ptr = NULL; \ + } \ + else \ + { \ + t_name##_Stack_Node* next_ptr = stack->top_ptr; \ + stack->top_ptr = \ + (t_name##_Stack_Node*)(DLIF_malloc(sizeof(t_name##_Stack_Node))); \ + stack->top_ptr->next_ptr = next_ptr; \ + } \ + \ + stack->top_ptr->value = to_push; \ +} \ + \ +t t_name##_pop(t_name##_Stack* stack) \ +{ \ + t to_ret; \ + t_name##_Stack_Node* next_ptr = stack->top_ptr->next_ptr; \ + \ + stack->size--; \ + to_ret = stack->top_ptr->value; \ + DLIF_free((void*)(stack->top_ptr)); \ + \ + if(!stack->size) \ + stack->top_ptr = stack->bottom_ptr = NULL; \ + else \ + stack->top_ptr = next_ptr; \ + \ + return to_ret; \ +} + +#endif diff --git a/src/core/dsp/ocl_load/DLOAD/dload.c b/src/core/dsp/ocl_load/DLOAD/dload.c new file mode 100644 index 0000000..e5924d8 --- /dev/null +++ b/src/core/dsp/ocl_load/DLOAD/dload.c @@ -0,0 +1,3534 @@ +/* +* dload.c +* +* Core Dynamic Loader Reference Implementation +* +* This implementation of the core dynamic loader is platform independent, +* but it is object file format dependent. In particular, this +* implementation supports ELF object file format. 
+* +* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/ +* +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* +* Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the +* distribution. +* +* Neither the name of Texas Instruments Incorporated nor the names of +* its contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*
+*/
+
+#include <limits.h>
+#include <inttypes.h>
+#include <string.h>
+#include <time.h>
+
+#include "ArrayList.h"
+#include "Queue.h"
+#include "Stack.h"
+
+#include "symtab.h"
+#include "dload_endian.h"
+#include "elf32.h"
+#include "dload.h"
+#include "relocate.h"
+#include "dload_api.h"
+
+#ifdef ARM_TARGET
+#include "arm_dynamic.h"
+#endif
+
+#ifdef C60_TARGET
+#include "c60_dynamic.h"
+#endif
+
+#include "virtual_targets.h"
+
+/*---------------------------------------------------------------------------*/
+/* These globals are used only to test the reference client implementation.  */
+/*---------------------------------------------------------------------------*/
+int global_argc;
+char **global_argv;
+
+/*---------------------------------------------------------------------------*/
+/* Contains filenames (type const char*) the system is in the process of     */
+/* loading.  Used to detect cycles in incorrectly compiled ELF binaries.     */
+/*---------------------------------------------------------------------------*/
+Array_List DLIMP_module_dependency_list;
+
+/*---------------------------------------------------------------------------*/
+/* Contains pointers (type DLIMP_Loaded_Module*) to the objects that the     */
+/* system has loaded into target memory.                                     */
+/*---------------------------------------------------------------------------*/
+TYPE_QUEUE_IMPLEMENTATION(DLIMP_Loaded_Module*, loaded_module_ptr)
+loaded_module_ptr_Queue DLIMP_loaded_objects = TYPE_QUEUE_INITIALIZER;
+
+/*---------------------------------------------------------------------------*/
+/* Dependency Stack - LIFO stack of dynamic modules that are loaded          */
+/* when client asks to load a dynamic executable or library.  Note that      */
+/* dependents that have already been loaded with another module will not     */
+/* appear on this stack.                                                     */
+/*---------------------------------------------------------------------------*/
+TYPE_STACK_IMPLEMENTATION(DLIMP_Dynamic_Module*, dynamic_module_ptr)
+dynamic_module_ptr_Stack DLIMP_dependency_stack = TYPE_STACK_INITIALIZER;
+
+/*---------------------------------------------------------------------------*/
+/* Current virtual target set after reading the file headers.  This is used  */
+/* to access target specific functions.                                      */
+/*---------------------------------------------------------------------------*/
+VIRTUAL_TARGET *cur_target = NULL;
+
+/*---------------------------------------------------------------------------*/
+/* Support for profiling performance of dynamic loader core.                 */
+/*---------------------------------------------------------------------------*/
+#if LOADER_DEBUG /* true only when LOADER_DEBUG expands to a non-zero value */
+static clock_t cycle0 = 0;     /* clock() value captured by profile_start_clock() */
+static clock_t cycle_end = 0;  /* clock() value captured by profile_stop_clock()  */
+#define profile_start_clock() (cycle0 = clock())
+#define profile_stop_clock() (cycle_end = clock())
+#define profile_cycle_count() (cycle_end - cycle0) /* elapsed clock() ticks between the two captures */
+#endif
+
+/*---------------------------------------------------------------------------*/
+/* The dynamic loader will now create a table TI_init_table to store         */
+/* pre-init and init data.  This is done because pre-init and                */
+/* init functions could reference as-yet unrelocated symbols from other      */
+/* modules.  As such it is safer to store relevant function addresses and    */
+/* execute them only after all modules are relocated.
*/ +/*---------------------------------------------------------------------------*/ +TYPE_QUEUE_IMPLEMENTATION(IF_single_record*, IF_table) +IF_table_Queue TI_init_table = TYPE_QUEUE_INITIALIZER; + +static VIRTUAL_TARGET *get_vt_obj(int given_id); +static void read_args_from_section(DLIMP_Loaded_Module* ep_module); +static BOOL seg_has_space_for_write(DLIMP_Loaded_Module* lmodule, int sz); +static BOOL write_arguments_to_args_section(DLOAD_HANDLE handle, + int argc, char** argv, + DLIMP_Loaded_Module *ep_module); + +/*****************************************************************************/ +/* DLOAD_create() */ +/* */ +/* Create an instance of the dynamic loader core. */ +/* */ +/* client_handle: Private client token to be returned during select DLIF */ +/* function calls. */ +/* */ +/* returns: an opaque DLOAD core loader handle, identifying this instance.*/ +/* */ +/*****************************************************************************/ +DLOAD_HANDLE DLOAD_create(void *client_handle) +{ + LOADER_OBJECT *pLoaderObject = DLIF_malloc(sizeof(LOADER_OBJECT)); + + /*-----------------------------------------------------------------------*/ + /* Fill out the Loader Object: */ + /*-----------------------------------------------------------------------*/ + /* Set up initial objects_loading queue. 
*/ + /*-----------------------------------------------------------------------*/ + AL_initialize(&(pLoaderObject->DLIMP_module_dependency_list), + sizeof (const char*), 1); + + /*-----------------------------------------------------------------------*/ + /* Initialize Loaded Module Ptr Queue */ + /*-----------------------------------------------------------------------*/ + loaded_module_ptr_initialize_queue(&pLoaderObject->DLIMP_loaded_objects); + + /*-----------------------------------------------------------------------*/ + /* Initialize Dynamic Module Ptr Stack */ + /*-----------------------------------------------------------------------*/ + dynamic_module_ptr_initialize_stack(&pLoaderObject->DLIMP_dependency_stack); + + pLoaderObject->file_handle = 1; + + /*-----------------------------------------------------------------------*/ + /* Store client token, so it can be handed back during DLIF calls */ + /*-----------------------------------------------------------------------*/ + pLoaderObject->client_handle = client_handle; + + return((DLOAD_HANDLE)pLoaderObject); +} + +/*****************************************************************************/ +/* DLOAD_destroy() */ +/* */ +/* Remove an instance of the dynamic loader core, and free all resources */ +/* allocated during DLOAD_create(). */ +/* */ +/* client_handle: Private client token to be returned during select DLIF */ +/* function calls. */ +/* Preconditions: 1) handle must be valid. */ +/* 2) Loader instance must be in "UNLOADED" state. 
*/
+/*                                                                           */
+/*****************************************************************************/
+void DLOAD_destroy(DLOAD_HANDLE handle)
+{
+    LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+
+    /*-----------------------------------------------------------------------*/
+    /* Release the dependency list storage first, then the instance object   */
+    /* that contains it.                                                     */
+    /*-----------------------------------------------------------------------*/
+    AL_destroy(&loader->DLIMP_module_dependency_list);
+    DLIF_free(loader);
+}
+
+/*****************************************************************************/
+/* DLIMP_get_first_dyntag()                                                  */
+/*                                                                           */
+/*    Scan the given dynamic table, which must end with a DT_NULL entry,     */
+/*    and return the value of the first entry whose tag equals the given     */
+/*    key.  INT_MAX is returned when no entry carries that tag.              */
+/*                                                                           */
+/*****************************************************************************/
+uint32_t DLIMP_get_first_dyntag(int tag, struct Elf32_Dyn* dyn_table)
+{
+   struct Elf32_Dyn *entry;
+
+   /*------------------------------------------------------------------------*/
+   /* Walk the entries up to the DT_NULL terminator; the first match wins.   */
+   /*------------------------------------------------------------------------*/
+   for (entry = dyn_table; entry->d_tag != DT_NULL; entry++)
+   {
+      if (entry->d_tag == tag)
+         return entry->d_un.d_val;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* No entry matched: hand back the agreed "not found" marker.             */
+   /*------------------------------------------------------------------------*/
+   return INT_MAX;
+}
+
+/*****************************************************************************/
+/* dload_and_allocate_dependencies()                                         */
+/*                                                                           */
+/*    If not already loaded, load each dependent file identified in the      */
+/*    dynamic segment with a DT_NEEDED tag.  Dependent files are listed in   */
+/*    order and should be loaded in the same order that they appear in the   */
+/*    dynamic segment.
*/
+/*                                                                           */
+/*****************************************************************************/
+static BOOL dload_and_allocate_dependencies( DLOAD_HANDLE handle,
+                                             DLIMP_Dynamic_Module *dyn_module)
+{
+   /*------------------------------------------------------------------------*/
+   /* Spin through each dynamic tag entry in the dynamic segment.            */
+   /*------------------------------------------------------------------------*/
+   struct Elf32_Dyn* dyn_nugget = dyn_module->dyntab;
+   LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle;
+
+#if LOADER_DEBUG
+   if (debugging_on)
+      DLIF_trace("Starting dload_and_allocate_dependencies() for %s ...\n",
+                 dyn_module->name);
+#endif
+
+   while(dyn_nugget->d_tag != DT_NULL)
+   {
+      /*---------------------------------------------------------------------*/
+      /* For each DT_NEEDED dynamic tag that we find in the dynamic segment, */
+      /* load the dependent file identified by the so_name value attached    */
+      /* to the DT_NEEDED dynamic tag.                                       */
+      /*---------------------------------------------------------------------*/
+      if (dyn_nugget->d_tag == DT_NEEDED)
+      {
+         loaded_module_ptr_Queue_Node* ptr;
+
+#if LOADER_DEBUG
+         if (debugging_on)
+            DLIF_trace("Found DT_NEEDED: %s\n",
+                       dyn_module->strtab+dyn_nugget->d_un.d_val);
+#endif
+
+         /*------------------------------------------------------------------*/
+         /* Find out if the file named by the DT_NEEDED tag has already      */
+         /* been loaded.  If it has, then we only have to bump the use count */
+         /* of the named dependent file.  ptr is left NULL when no loaded    */
+         /*------------------------------------------------------------------*/
+         for (ptr = pHandle->DLIMP_loaded_objects.front_ptr; ptr != NULL;
+              ptr = ptr->next_ptr)
+         {
+
+
+            if (!strcmp(ptr->value->name,
+                        dyn_module->strtab + dyn_nugget->d_un.d_val))
+            {
+               ptr->value->use_count++;
+               AL_append(&(dyn_module->loaded_module->dependencies),
+                         &(ptr->value->file_handle)); /* record the existing module's handle */
+               break;
+            }
+         }
+
+         /*------------------------------------------------------------------*/
+         /* If the named dependent file has not been loaded, then we ask the */
+         /* client to invoke a load of the dependent file on our behalf.     */
+         /*------------------------------------------------------------------*/
+         if (ptr == NULL)
+         {
+            int32_t dependent_handle = DLIF_load_dependent(
+                                                  pHandle->client_handle,
+                                                  dyn_module->strtab +
+                                                  dyn_nugget->d_un.d_val);
+            AL_append(&(dyn_module->loaded_module->dependencies),
+                      &dependent_handle);
+            if (dependent_handle == 0) return FALSE; /* 0 means the load failed; note the 0 handle was already appended above */
+         }
+      }
+
+      dyn_nugget++;
+   }
+
+#if LOADER_DEBUG
+   if (debugging_on)
+      DLIF_trace("Finished dload_and_allocate_dependencies() for %s\n",
+                 dyn_module->name);
+#endif
+
+   return TRUE;
+}
+
+/*****************************************************************************/
+/* load_object()                                                             */
+/*                                                                           */
+/*    Finish the process of loading an object file.                          */
+/*                                                                           */
+/*****************************************************************************/
+static int load_object(LOADER_FILE_DESC *fd, DLIMP_Dynamic_Module *dyn_module)
+{
+   /*------------------------------------------------------------------------*/
+   /* With the dynamic loader already running on the target, we are able to  */
+   /* relocate directly into target memory, so there is nothing more to be   */
+   /* done (at least in the bare-metal dynamic linking ABI model).
*/ + /*------------------------------------------------------------------------*/ + return 1; +} + +/*****************************************************************************/ +/* write_arguments_to_args_section() */ +/* */ +/* Write argv and argc to .args section. */ +/* */ +/*****************************************************************************/ +static BOOL write_arguments_to_args_section(DLOAD_HANDLE handle, + int argc, char** argv, + DLIMP_Loaded_Module *ep_module) +{ + int mem_inc = MEM_INC; + int ptr_sz = PTR_SZ; + int p_size = ptr_sz / mem_inc; + int i_size = T_INTSZ / mem_inc; + int c_size = T_CHARSZ /mem_inc; + int argv_offset = 0; + int str_offset = 0; + int size = 0; + int arg; + int *targ_argv_pointers = NULL; + LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle; + + uint8_t *c_args = NULL; + +#if LOADER_DEBUG + if (debugging_on) + DLIF_trace("Write_arguments_to_args_section:\n"); +#endif + + /*-----------------------------------------------------------------------*/ + /* IF NO ARGUMENTS, ABORT QUIETLY, WITH a SUCCESSFUL CODE. */ + /*-----------------------------------------------------------------------*/ + if (argc == 0) return TRUE; + + /*-----------------------------------------------------------------------*/ + /* __c_args__ points to the beginning of the .args section, if there */ + /* is one. This is stored in the Loaded Module, and must have a */ + /* legitimate address. If not, abort with Warning. */ + /*-----------------------------------------------------------------------*/ + c_args = ep_module->c_args; + if (!c_args || c_args == (uint8_t *)0xFFFFFFFF) + { + DLIF_warning(DLWT_MISC, "__c_args__ does not have valid value.\n"); + return FALSE; + } + + /*-----------------------------------------------------------------------*/ + /* WE OUGHT TO WORRY ABOUT ALIGNMENT: IF SECTION ISN'T PROPERLY ALIGNED, */ + /* ABORT THE PROCESSING OF ARGUMENTS WITH A NICE ERROR MESSAGE. 
*/ + /*-----------------------------------------------------------------------*/ + if (c_args && ((Elf32_Addr)c_args & (MAX(p_size, i_size) - 1))) + { + DLIF_warning(DLWT_MISC, ".args section not properly aligned\n"); + return FALSE; + } + + /*-----------------------------------------------------------------------*/ + /* CALCULATE OFFSET IN TABLE WHERE ARGV AND THE STRINGS WILL BE STORED. */ + /* NOTE THAT argv MAY NEED MORE ALIGNMENT THAN AN INTEGER, SO ITS OFFSET */ + /* IS REALLY THE MAXIMUM OF A POINTER SIZE AND INTEGER SIZE. ALSO NOTE */ + /* WE NEED TO ALLOCATE AN EXTRA POINTER FOR argv[argc]. */ + /*-----------------------------------------------------------------------*/ + argv_offset = MAX(p_size, i_size); + str_offset = argv_offset + (argc * p_size) + p_size ; + + /*-----------------------------------------------------------------------*/ + /* CALCULATE SPACE REQUIRED FOR WRITING OUT .args SECTION. CHECK IF THE */ + /* SEGMENT HAS ENOUGH SPACE AVAILABLE. IF NOT, RETURN WITH ERROR CODE. 
*/ + /*-----------------------------------------------------------------------*/ + size = str_offset; + + for (arg = 0; arg < argc; arg++) + size += (c_size * (strlen(argv[arg]) + 1)); + + if (!seg_has_space_for_write(ep_module, size)) + { + DLIF_warning(DLWT_MISC, + "Segment has insufficient space for .args contents\n"); + return FALSE; + } + + /*-----------------------------------------------------------------------*/ + /* OVERALL, WE NEED TO CREATE A TARGET IMAGE THAT CORRESPONDS TO: */ + /* int argc; */ + /* char *argv[argc]; */ + /* <strings pointed to by argv> */ + /* So say, for C6x, for "-v -d", we would need 22 bytes: */ + /* 4 bytes // argc */ + /* 4 bytes // argv[0] pointer value */ + /* 4 bytes // argv[1] pointer value */ + /* 4 bytes // argv[argc] end of pointer value array, normally 0 */ + /* 3 bytes // "-v" */ + /* 3 bytes // "-d" */ + /*-----------------------------------------------------------------------*/ + + /*-----------------------------------------------------------------------*/ + /* FIRST WRITE OUT ARGC. 
*/ + /*-----------------------------------------------------------------------*/ +#if LOADER_DEBUG + if (debugging_on) + DLIF_trace ("-- Copy %d bytes from 0x%x to 0x%x\n", + i_size, (uint32_t) &argc, (uint32_t) c_args); +#endif + + DLIF_memcpy(pHandle->client_handle, c_args, &argc, i_size); + + /*-----------------------------------------------------------------------*/ + /* CREATE AN INTERNAL ARRAY OF ARGV POINTER VALUES, THEN WRITE THEM OUT */ + /*-----------------------------------------------------------------------*/ + targ_argv_pointers = (int *)DLIF_malloc((argc + 1) * sizeof(int)); + for (arg = 0; arg < argc ; arg++) + { + targ_argv_pointers[arg] = (int)(str_offset + c_args); + str_offset += (strlen(argv[arg]) + 1) * c_size; + +#if LOADER_DEBUG + if (debugging_on) + DLIF_trace ("\t\ttarg_argv_pointers[%d] : 0x%x\n", + arg, targ_argv_pointers[arg]); +#endif + } + + targ_argv_pointers[argc] = 0; + + /*-----------------------------------------------------------------------*/ + /* WRITE OUT THIS INTERNAL ARRAY OF ARGV POINTER VALUES */ + /*-----------------------------------------------------------------------*/ + for (arg = 0; arg <= argc; arg++) + { +#if LOADER_DEBUG + if (debugging_on) + DLIF_trace ("-- Copy %d bytes from 0x%x to 0x%x\n", + p_size, (uint32_t) &targ_argv_pointers[arg], + (uint32_t) (c_args + argv_offset)); +#endif + DLIF_memcpy(pHandle->client_handle, + (void *)(c_args + argv_offset), + &targ_argv_pointers[arg], + p_size); + argv_offset += p_size; + } + +#if LOADER_DEBUG +if (debugging_on) +{ + DLIF_trace ("\t\targv being copied : 0x%x\n",(uint32_t)argv); + for (arg = 0; arg < argc; arg++) + { + DLIF_trace ("\t\t---\n\t\t&argv[%d] being copied : 0x%x\n", arg, + (uint32_t)&argv[arg]); + DLIF_trace ("\t\targv[%d] being copied : 0x%x\n",arg, + (uint32_t)argv[arg]); + DLIF_trace ("\t\targv[%d] being copied : %s\n",arg, (char *)argv[arg]); + } +} +#endif + + /*-----------------------------------------------------------------------*/ + /* LASTLY 
WRITE OUT ALL THE STRINGS. */ + /*-----------------------------------------------------------------------*/ + for (arg = 0; arg < argc; arg++) + { +#if LOADER_DEBUG + if (debugging_on) + DLIF_trace ("-- Copy %d bytes from 0x%x to 0x%x\n", + (uint32_t)strlen(argv[arg]) + 1, + (uint32_t)&argv[arg], + (uint32_t)(targ_argv_pointers[arg])); +#endif + DLIF_memcpy(pHandle->client_handle, + (void *)(targ_argv_pointers[arg]), + argv[arg], + strlen(argv[arg]) + 1); + } + + return TRUE; +} + + +/*****************************************************************************/ +/* initialize_loaded_module() */ +/* */ +/* Initialize DLIMP_Loaded_Module internal data object associated with a */ +/* dynamic module. This function will also set up a queue of */ +/* DLIMP_Loaded_Segment(s) associated with the loaded module. */ +/* This function is called as we are getting ready to actually load the */ +/* object file contents into target memory. Each segment will get a */ +/* target memory request that it can use to ask the client for target */ +/* memory space. This function will also assign a file handle to the */ +/* loaded module. */ +/* */ +/*---------------------------------------------------------------------------*/ +/* */ +/* In applications that use the DSBT model, this function will also need to */ +/* negotiate the module's DSBT index with the client. */ +/* */ +/*****************************************************************************/ +static void initialize_loaded_module(DLOAD_HANDLE handle, + DLIMP_Dynamic_Module *dyn_module) +{ + int i; + LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle; + + /*------------------------------------------------------------------------*/ + /* Allocate a DLIMP_Loaded_Module data structure for the specified ELF */ + /* file and assign a file handle for it (bumping the file handle counter */ + /* as we go). 
*/ + /*------------------------------------------------------------------------*/ + DLIMP_Loaded_Module *loaded_module = + dyn_module->loaded_module = DLIF_malloc(sizeof(DLIMP_Loaded_Module)); + +#if LOADER_DEBUG || LOADER_PROFILE + /*------------------------------------------------------------------------*/ + /* Start clock on initialization of loaded module object. */ + /*------------------------------------------------------------------------*/ + if (debugging_on || profiling_on) + { + DLIF_trace("Starting initialize_loaded_module() ...\n"); + if (profiling_on) profile_start_clock(); + } +#endif + + if (dyn_module->name) + { + loaded_module->name = DLIF_malloc(strlen(dyn_module->name) + 1); + strcpy(loaded_module->name, dyn_module->name); + } + else + loaded_module->name = "<unknown>"; + + loaded_module->file_handle = pHandle->file_handle++; + loaded_module->direct_dependent_only = dyn_module->direct_dependent_only; + loaded_module->use_count = 1; + + /*------------------------------------------------------------------------*/ + /* In case we wrapped around the file handle, return error. */ + /*------------------------------------------------------------------------*/ + if (pHandle->file_handle == 0) + DLIF_error(DLET_MISC, "DLOAD File handle overflowed.\n"); + + /*------------------------------------------------------------------------*/ + /* Initially the loaded module does not have access to its global */ + /* symbols. These need to be copied from the dynamic module (see call */ + /* to DLSYM_copy_globals() below). */ + /* */ + /* THESE INITIALIZATIONS SHOULD BE MOVED TO AN INIT ROUTINE FOR THE */ + /* LOADED MODULE */ + /*------------------------------------------------------------------------*/ + loaded_module->gsymtab = NULL; + loaded_module->gstrtab = NULL; + loaded_module->gsymnum = loaded_module->gstrsz = 0; + + /*------------------------------------------------------------------------*/ + /* Initialize the Array_List of dependencies. 
*/ + /*------------------------------------------------------------------------*/ + AL_initialize(&(loaded_module->dependencies), sizeof(int), 1); + + if (dyn_module->symtab) + DLSYM_copy_globals(dyn_module); + + /*------------------------------------------------------------------------*/ + /* Initialize the module loaded segments Array_List. */ + /*------------------------------------------------------------------------*/ + AL_initialize(&(loaded_module->loaded_segments), + sizeof(DLIMP_Loaded_Segment), dyn_module->phnum); + + /*------------------------------------------------------------------------*/ + /* Spin thru segment headers and process each load segment encountered. */ + /*------------------------------------------------------------------------*/ + for (i = 0; i < dyn_module->phnum; i++) + if (dyn_module->phdr[i].p_type == PT_LOAD) + { + /*------------------------------------------------------------------*/ + /* Note that this is parallel to and does not supplant the ELF */ + /* phdr tables. */ + /*------------------------------------------------------------------*/ + DLIMP_Loaded_Segment seg; + seg.obj_desc = DLIF_malloc(sizeof(struct DLOAD_MEMORY_SEGMENT)); + seg.phdr.p_vaddr = dyn_module->phdr[i].p_vaddr; + seg.phdr.p_offset = dyn_module->phdr[i].p_offset; + seg.obj_desc->target_page = 0; /*not used*/ + seg.modified = 0; + seg.phdr.p_filesz = seg.obj_desc->objsz_in_bytes + = dyn_module->phdr[i].p_filesz; + seg.phdr.p_memsz = seg.obj_desc->memsz_in_bytes + = dyn_module->phdr[i].p_memsz; + seg.phdr.p_align = dyn_module->phdr[i].p_align; + seg.phdr.p_flags = dyn_module->phdr[i].p_flags; + AL_append(&(loaded_module->loaded_segments), &seg); + } + + /*------------------------------------------------------------------------*/ + /* Initialize the DSO termination information for this module. 
*/ + /* It will be copied over from the enclosing dyn_module object when */ + /* placement is completed and dyn_module's local copy of the dynamic */ + /* table is updated. */ + /*------------------------------------------------------------------------*/ + loaded_module->fini_array = (Elf32_Addr) NULL; + loaded_module->fini_arraysz = 0; + loaded_module->fini = (Elf32_Addr) NULL; + +#if LOADER_DEBUG || LOADER_PROFILE + if (debugging_on || profiling_on) + { + DLIF_trace("Finished initialize_loaded_module()\n"); + if (profiling_on) + { + profile_stop_clock(); + DLIF_trace("Took %lu cycles.\n", + (unsigned long)profile_cycle_count()); + } + } +#endif + +} + +/*****************************************************************************/ +/* load_static_segment() */ +/* */ +/* The core dynamic loader requires that a statically linked executable */ +/* be placed in target memory at the location that was determined during */ +/* the static link that created the executable. Failure to get the */ +/* required target memory where the static executable is to be loaded */ +/* will cause the dynamic loader to emit an error and abort the load. */ +/* */ +/*****************************************************************************/ +static BOOL load_static_segment(DLOAD_HANDLE handle, LOADER_FILE_DESC *fd, + DLIMP_Dynamic_Module *dyn_module) +{ + int i; + DLIMP_Loaded_Segment* seg = (DLIMP_Loaded_Segment*) + (dyn_module->loaded_module->loaded_segments.buf); + LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle; + + /*------------------------------------------------------------------------*/ + /* For each segment in the loaded module, build up a target memory */ + /* request for the segment, get rights to target memory where we want */ + /* to load the segment from the client, then get the client to write the */ + /* segment contents out to target memory to the appropriate address. 
*/ + /*------------------------------------------------------------------------*/ + for (i = 0; i < dyn_module->loaded_module->loaded_segments.size; i++) + { + struct DLOAD_MEMORY_REQUEST targ_req; + seg[i].obj_desc->target_page = 0; + targ_req.flags = 0; + + /*---------------------------------------------------------------------*/ + /* This is a static executable. DLIF_allocate should give us the */ + /* address we ask for or fail. */ + /*---------------------------------------------------------------------*/ + if (seg[i].phdr.p_flags & PF_X) targ_req.flags |= DLOAD_SF_executable; + if (seg[i].phdr.p_flags & PF_W) targ_req.flags |= DLOAD_SF_writable; + + + targ_req.align = seg[i].phdr.p_align; + seg[i].obj_desc->target_address = (TARGET_ADDRESS)seg[i].phdr.p_vaddr; + targ_req.flags &= ~DLOAD_SF_relocatable; + targ_req.fp = fd; + targ_req.segment = seg[i].obj_desc; + targ_req.offset = seg[i].phdr.p_offset; + targ_req.flip_endian = dyn_module->wrong_endian; + + /*---------------------------------------------------------------------*/ + /* Ask the client side of the dynamic loader to allocate target memory */ + /* for this segment to be loaded into. */ + /*---------------------------------------------------------------------*/ + if (!DLIF_allocate(pHandle->client_handle, &targ_req)) return FALSE; + + /*---------------------------------------------------------------------*/ + /* If there is any initialized data in the segment, we'll first write */ + /* it into a host writable buffer (DLIF_copy()) and then flush it to */ + /* target memory. 
*/ + /*---------------------------------------------------------------------*/ + if (seg[i].phdr.p_filesz) + { + DLIF_copy(pHandle->client_handle, &targ_req); + DLIF_write(pHandle->client_handle, &targ_req); + } + } + + return TRUE; +} + +/*****************************************************************************/ +/* relocate_target_dynamic_tag_info() */ +/* */ +/* Update a target specific dynamic tag value that happens to be a */ +/* virtual address of a section. Returns TRUE if the tag was updated or */ +/* is not a virtual address and FALSE if it was not successfully updated */ +/* or was not recognized. */ +/*****************************************************************************/ +static BOOL relocate_target_dynamic_tag_info(DLIMP_Dynamic_Module *dyn_module, + int i) +{ + return cur_target->relocate_dynamic_tag_info(dyn_module, i); +} + +/*****************************************************************************/ +/* DLIMP_update_dyntag_section_address() */ +/* */ +/* Given the index of a dynamic tag which we happen to know points to a */ +/* section address, find the program header table entry associated with */ +/* the specified address and update the tag value with the real address */ +/* of the section. */ +/* */ +/*****************************************************************************/ +BOOL DLIMP_update_dyntag_section_address(DLIMP_Dynamic_Module *dyn_module, + int32_t i) +{ + int j; + DLIMP_Loaded_Segment *seg = (DLIMP_Loaded_Segment *) + (dyn_module->loaded_module->loaded_segments.buf); + + /*------------------------------------------------------------------------*/ + /* If dynamic tag does not access an existing section, then no update */ + /* is required. 
*/ + /*------------------------------------------------------------------------*/ + if (dyn_module->dyntab[i].d_un.d_ptr == (Elf32_Addr)0) + { return TRUE; } + + for (j = 0; j < dyn_module->loaded_module->loaded_segments.size; j++) + { + if ((dyn_module->dyntab[i].d_un.d_ptr >= seg[j].input_vaddr) && + (dyn_module->dyntab[i].d_un.d_ptr < + (seg[j].input_vaddr + seg[j].phdr.p_memsz))) + { + dyn_module->dyntab[i].d_un.d_ptr += + (seg[j].phdr.p_vaddr - seg[j].input_vaddr); + return TRUE; + } + } + + return FALSE; +} + +/*****************************************************************************/ +/* relocate_dynamic_tag_info() */ +/* */ +/* Once segment allocation has been completed, we'll need to go through */ +/* the dynamic table and update any tag values that happen to be virtual */ +/* addresses of segments (DT_C6000_DSBT_BASE, for example). */ +/* */ +/*****************************************************************************/ +static BOOL relocate_dynamic_tag_info(LOADER_FILE_DESC *fd, + DLIMP_Dynamic_Module *dyn_module) +{ + /*------------------------------------------------------------------------*/ + /* Spin through dynamic table loking for tags that have a value which is */ + /* the virtual address of a section. After the sections are allocated, */ + /* we'll need to update these values with the new address of the section. */ + /*------------------------------------------------------------------------*/ + int i; + for (i = 0; dyn_module->dyntab[i].d_tag != DT_NULL; i++) + { + switch (dyn_module->dyntab[i].d_tag) + { + /*------------------------------------------------------------------*/ + /* Only tag values that are virtual addresses will be affected. 
*/ + /*------------------------------------------------------------------*/ + case DT_NEEDED: + case DT_PLTRELSZ: + case DT_HASH: + case DT_STRTAB: + case DT_SYMTAB: + case DT_RELA: + case DT_RELASZ: + case DT_RELAENT: + case DT_STRSZ: + case DT_SYMENT: + case DT_SONAME: + case DT_RPATH: + case DT_SYMBOLIC: + case DT_REL: + case DT_RELSZ: + case DT_RELENT: + case DT_PLTREL: + case DT_DEBUG: + case DT_TEXTREL: + case DT_BIND_NOW: + case DT_INIT_ARRAYSZ: + case DT_RUNPATH: + case DT_FLAGS: + case DT_PREINIT_ARRAYSZ: + continue; + + /*------------------------------------------------------------------*/ + /* NOTE!!! */ + /* case DT_ENCODING: -- tag type has same "id" as DT_PREINIT_ARRAY */ + /*------------------------------------------------------------------*/ + + /*------------------------------------------------------------------*/ + /* This is a generic dynamic tag whose value is a virtual address */ + /* of a section. It needs to be relocated to the section's actual */ + /* address in target memory. */ + /*------------------------------------------------------------------*/ + case DT_PREINIT_ARRAY: + case DT_INIT: + case DT_INIT_ARRAY: + if (!DLIMP_update_dyntag_section_address(dyn_module, i)) + return FALSE; + + continue; + + /*------------------------------------------------------------------*/ + /* Once we have resolved the actual address of termination function */ + /* sections, we need to copy their addresses over to the loaded */ + /* module object (dyn_module will be deleted before we get to */ + /* unloading the module). 
*/ + /*------------------------------------------------------------------*/ + case DT_FINI_ARRAY: + case DT_FINI: + if (!DLIMP_update_dyntag_section_address(dyn_module, i)) + return FALSE; + + if (dyn_module->dyntab[i].d_tag == DT_FINI) + dyn_module->loaded_module->fini = + dyn_module->dyntab[i].d_un.d_ptr; + else + dyn_module->loaded_module->fini_array = + dyn_module->dyntab[i].d_un.d_ptr; + + continue; + + case DT_FINI_ARRAYSZ: + dyn_module->loaded_module->fini_arraysz = + dyn_module->dyntab[i].d_un.d_val; + continue; + + /*------------------------------------------------------------------*/ + /* Is this a virtual address??? */ + /*------------------------------------------------------------------*/ + case DT_JMPREL: /* is this a virtual address??? */ + continue; + + /*------------------------------------------------------------------*/ + /* The remaining dynamic tag types should be target specific. If */ + /* something generic slips through to here, then the handler for */ + /* relocating target specific dynamic tags should fail. */ + /*------------------------------------------------------------------*/ + default: + if (!relocate_target_dynamic_tag_info(dyn_module, i)) + return FALSE; + } + } + + /*------------------------------------------------------------------------*/ + /* We've gotten through all of the dynamic table without incident. */ + /* All dynamic tag values that were virtual section addresses should have */ + /* been updated with the final address of the section that they point to. */ + /*------------------------------------------------------------------------*/ + return TRUE; +} + +/*****************************************************************************/ +/* allocate_dynamic_segments_and relocate_symbols() */ +/* */ +/* Allocate target memory for each segment in this module, getting a */ +/* host-accessible space to copy the content of each segment into. 
Then */ +/* update the symbol table and program header table to reflect the new */ +/* target address for each segment. Processing of the dynamic relocation */ +/* entries will wait until all dependent files have been loaded and */ +/* allocated into target memory. */ +/* */ +/*---------------------------------------------------------------------------*/ +/* */ +/* The relocation entries in the ELF file do not handle the necessary */ +/* adjustments to the memory addresses in the program header or symbol */ +/* tables. These must be done manually. */ +/* */ +/* This is harder for us than for most dynamic loaders, because we have to */ +/* work in environments without virtual memory and thus where the offsets */ +/* between segments in memory may be different than they were in the file. */ +/* So, even though a dynamic loader usually only has to adjust all the */ +/* segments by a single fixed offset, we need to offset the symbols and */ +/* program header addresses segment by segment. This job is done by the */ +/* function below. */ +/* */ +/*****************************************************************************/ +static BOOL allocate_dynamic_segments_and_relocate_symbols + (DLOAD_HANDLE handle, + LOADER_FILE_DESC *fd, + DLIMP_Dynamic_Module *dyn_module) +{ + int i,j; + DLIMP_Loaded_Segment* seg = (DLIMP_Loaded_Segment*) + (dyn_module->loaded_module->loaded_segments.buf); + struct Elf32_Ehdr *fhdr = &(dyn_module->fhdr); + LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle; + +#if LOADER_DEBUG || LOADER_PROFILE + if (debugging_on || profiling_on) + { + DLIF_trace("Dynamic executable found.\n" + "Starting allocate_dynamic_segments_and_relocate_symbols()" + "...\n"); + if (profiling_on) profile_start_clock(); + } +#endif + + /*------------------------------------------------------------------------*/ + /* Spin through the list of loaded segments from the current module. 
*/ + /*------------------------------------------------------------------------*/ + for (i = 0; i < dyn_module->loaded_module->loaded_segments.size; i++) + { + /*--------------------------------------------------------------------*/ + /* Allocate target memory for segment via client-provided target */ + /* memory API. */ + /*--------------------------------------------------------------------*/ + int32_t addr_offset; + struct DLOAD_MEMORY_REQUEST targ_req; + seg[i].obj_desc->target_page = 0; + targ_req.flags = 0; + if (seg[i].phdr.p_flags & PF_X) targ_req.flags |= DLOAD_SF_executable; + if (seg[i].phdr.p_flags & PF_W) targ_req.flags |= DLOAD_SF_writable; + + targ_req.align = 0x20; + seg[i].obj_desc->target_address = (TARGET_ADDRESS)seg[i].phdr.p_vaddr; + targ_req.flags |= DLOAD_SF_relocatable; + targ_req.fp = fd; + targ_req.segment = seg[i].obj_desc; + targ_req.offset = seg[i].phdr.p_offset; + targ_req.flip_endian = dyn_module->wrong_endian; + + if (!DLIF_allocate(pHandle->client_handle, &targ_req)) + { + DLIF_error(DLET_MEMORY, "DLIF allocation failure.\n"); + return FALSE; + } + + /*--------------------------------------------------------------------*/ + /* Calculate the offset we need to adjust segment header and symbol */ + /* table addresses. */ + /*--------------------------------------------------------------------*/ + addr_offset = (int32_t)(seg[i].obj_desc->target_address) - + (int32_t)(seg[i].phdr.p_vaddr); + +#if LOADER_DEBUG + if (debugging_on) + { + DLIF_trace("Segment %d (at 0x%x, 0x%x bytes) relocated to 0x%x\n", i, + (int32_t)(seg[i].phdr.p_vaddr), + (int32_t)(seg[i].phdr.p_memsz), + (int32_t)(seg[i].obj_desc->target_address)); + DLIF_trace("Addr Offset is 0x%x\n", addr_offset); + } +#endif + + /*--------------------------------------------------------------------*/ + /* Update program entry point if needed. Need to replace to deal */ + /* with full ELF initialization routine. 
*/ + /*--------------------------------------------------------------------*/ + if (dyn_module->relocate_entry_point && + fhdr->e_entry >= (Elf32_Addr)(seg[i].phdr.p_vaddr) && + fhdr->e_entry < + (Elf32_Addr)((uint8_t*)(seg[i].phdr.p_vaddr) + + (uint32_t)(seg[i].phdr.p_memsz))) + { +#if LOADER_DEBUG + if (debugging_on) + { + DLIF_trace("Entry point 0x%x relocated to 0x%x\n", + fhdr->e_entry, fhdr->e_entry + addr_offset); + } +#endif + fhdr->e_entry += addr_offset; + + /*------------------------------------------------------------------*/ + /* Mark the entry point as being relocated so we will not do it */ + /* again. */ + /*------------------------------------------------------------------*/ + dyn_module->relocate_entry_point = FALSE; + } + + /*---------------------------------------------------------------------*/ + /* Fix program header entries in segment and Elf32_Phdr structs. */ + /*---------------------------------------------------------------------*/ + for (j = 0; j < fhdr->e_phnum; j++) + if (dyn_module->phdr[j].p_vaddr == (Elf32_Addr)seg[i].phdr.p_vaddr) + { + dyn_module->phdr[j].p_vaddr += addr_offset; + dyn_module->phdr[i].p_paddr += addr_offset; + break; + } + + seg[i].input_vaddr = (Elf32_Addr)(seg[i].phdr.p_vaddr); + seg[i].phdr.p_vaddr += addr_offset; + + /*---------------------------------------------------------------------*/ + /* Great, now the hard part: fix offsets in symbols. It would be nice */ + /* if there were an easier way to deal with this. 
*/ + /*---------------------------------------------------------------------*/ + { + struct Elf32_Sym *gsymtab = + ((struct Elf32_Sym*)(dyn_module->loaded_module->gsymtab)); + Elf32_Addr segment_start = (Elf32_Addr)seg[i].phdr.p_vaddr; + Elf32_Addr segment_end = (Elf32_Addr)seg[i].phdr.p_vaddr + + seg[i].phdr.p_memsz; + Elf32_Word global_index = dyn_module->symnum - + dyn_module->loaded_module->gsymnum; + + for (j = 0; j < dyn_module->symnum; j++) + { + /*---------------------------------------------------------------*/ + /* Get the relocated symbol value. */ + /*---------------------------------------------------------------*/ + Elf32_Addr symval_adj = dyn_module->symtab[j].st_value + + addr_offset; + + /*---------------------------------------------------------------*/ + /* If the symbol is defined in this segment, update the symbol */ + /* value and mark the symbol so that we don't relocate it again. */ + /*---------------------------------------------------------------*/ + if (symval_adj >= segment_start && symval_adj < segment_end && + dyn_module->symtab[j].st_shndx != INT16_MAX) + { + dyn_module->symtab[j].st_value = symval_adj; + + /*------------------------------------------------------------*/ + /* The module symbol table only has the global symbols. */ + /*------------------------------------------------------------*/ + if (j >= global_index) + gsymtab[j-global_index].st_value = symval_adj; + + /*------------------------------------------------------------*/ + /* Mark the symbol as relocated. */ + /*------------------------------------------------------------*/ + dyn_module->symtab[j].st_shndx = INT16_MAX; + } + } + } + } + + /*------------------------------------------------------------------------*/ + /* Update dynamic tag information. Some dynamic tags have values which */ + /* are virtual addresses of sections. These values need to be updated */ + /* once segment allocation is completed and the new segment addresses are */ + /* known. 
*/ + /*------------------------------------------------------------------------*/ + /* We should only traverse through the dynamic table once because we want */ + /* to avoid the possibility of updating the same tag multiple times (an */ + /* error, if it happens). */ + /*------------------------------------------------------------------------*/ + if (!relocate_dynamic_tag_info(fd, dyn_module)) + { + DLIF_error(DLET_MISC, "Failed dynamic table update.\n"); + return FALSE; + } + +#if LOADER_DEBUG || LOADER_PROFILE + if (debugging_on || profiling_on) + { + DLIF_trace("Finished allocate_dynamic_segments_and_relocate_symbols()\n"); + if (profiling_on) + { + profile_stop_clock(); + DLIF_trace("Took %lu cycles.\n", (unsigned long) profile_cycle_count()); + } + } +#endif + + return TRUE; +} + +/*****************************************************************************/ +/* delete_DLIMP_Loaded_Module() */ +/* */ +/* Free host memory associated with a DLIMP_Loaded_Module data structure */ +/* and all of the DLIMP_Loaded_Segment objects that are associated with */ +/* it. */ +/* */ +/*****************************************************************************/ +static void delete_DLIMP_Loaded_Module(DLOAD_HANDLE handle, + DLIMP_Loaded_Module **pplm) +{ + DLIMP_Loaded_Module *loaded_module = *pplm; + DLIMP_Loaded_Segment *segments = (DLIMP_Loaded_Segment*) + (loaded_module->loaded_segments.buf); + LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle; + + /*-----------------------------------------------------------------------*/ + /* Spin through the segments attached to this loaded module, freeing up */ + /* any target memory that was allocated by the client for the segment. 
*/ + /*-----------------------------------------------------------------------*/ + int i; + for (i = 0; i < loaded_module->loaded_segments.size; i++) + { + if (!DLIF_release(pHandle->client_handle, segments[i].obj_desc)) + DLIF_error(DLET_MISC, "Failed call to DLIF_release!\n");; + DLIF_free(segments[i].obj_desc); + } + + /*----------------------------------------------------------------------*/ + /* Hacky way of indicating that the base image is no longer available. */ + /* WHHHHAAAAAAATTT!?!?!?!?!?! */ + /*----------------------------------------------------------------------*/ + if (loaded_module->file_handle == DLIMP_application_handle) + DLIMP_application_handle = 0; + + /*-----------------------------------------------------------------------*/ + /* Free host heap memory that was allocated for the internal loaded */ + /* module data structure members. */ + /*-----------------------------------------------------------------------*/ + if (loaded_module->name) DLIF_free(loaded_module->name); + if (loaded_module->gsymtab) DLIF_free(loaded_module->gsymtab); + loaded_module->gsymnum = 0; + if (loaded_module->gstrtab) DLIF_free(loaded_module->gstrtab); + loaded_module->gstrsz = 0; + AL_destroy(&(loaded_module->loaded_segments)); + AL_destroy(&(loaded_module->dependencies)); + + /*-----------------------------------------------------------------------*/ + /* Finally, free the host memory for the loaded module object, then NULL */ + /* the pointer that was passed in. */ + /*-----------------------------------------------------------------------*/ + DLIF_free(loaded_module); + *pplm = NULL; +} + +/*****************************************************************************/ +/* new_DLIMP_Dynamic_Module() */ +/* */ +/* Allocate a dynamic module data structure from host memory and */ +/* initialize its members to their default values. 
*/
+/*    Returns the new object, or NULL if the host allocation failed.         */
+/*                                                                           */
+/*****************************************************************************/
+static DLIMP_Dynamic_Module *new_DLIMP_Dynamic_Module(LOADER_FILE_DESC *fd)
+{
+   /*-----------------------------------------------------------------------*/
+   /* Allocate space for dynamic module data structure from host memory.    */
+   /*-----------------------------------------------------------------------*/
+   DLIMP_Dynamic_Module *dyn_module =
+             (DLIMP_Dynamic_Module *)DLIF_malloc(sizeof(DLIMP_Dynamic_Module));
+
+   /*-----------------------------------------------------------------------*/
+   /* Do not touch the object if the allocation failed; let the caller see  */
+   /* the NULL instead of faulting on the first member store.               */
+   /*-----------------------------------------------------------------------*/
+   if (dyn_module == NULL) return NULL;
+
+   /*-----------------------------------------------------------------------*/
+   /* Initialize data members of the new dynamic module data structure.     */
+   /*-----------------------------------------------------------------------*/
+   dyn_module->name = NULL;
+   dyn_module->fd = fd;
+   dyn_module->phdr = NULL;
+   dyn_module->phnum = 0;
+   dyn_module->strtab = NULL;
+   dyn_module->strsz = 0;
+   dyn_module->dyntab = NULL;
+   dyn_module->symtab = NULL;
+   dyn_module->symnum = 0;
+   dyn_module->gsymtab_offset = 0;
+   dyn_module->gstrtab_offset = 0;
+   dyn_module->c_args = NULL;
+   dyn_module->argc = 0;
+   dyn_module->argv = NULL;
+   dyn_module->loaded_module = NULL;
+   dyn_module->wrong_endian = 0;
+   dyn_module->direct_dependent_only = TRUE;
+   dyn_module->relocatable = FALSE;
+   dyn_module->relocate_entry_point = TRUE;
+
+   /*-----------------------------------------------------------------------*/
+   /* DSBT bookkeeping: no table yet, no index assigned, no tag located.    */
+   /*-----------------------------------------------------------------------*/
+   dyn_module->dsbt_size = 0;
+   dyn_module->dsbt_index = DSBT_INDEX_INVALID;
+   dyn_module->dsbt_base_tagidx = -1;
+
+   /*-----------------------------------------------------------------------*/
+   /* Initialization tags: -1 marks "tag not seen in the dynamic table".    */
+   /*-----------------------------------------------------------------------*/
+   dyn_module->preinit_array_idx = -1;
+   dyn_module->preinit_arraysz = 0;
+   dyn_module->init_idx = -1;
+   dyn_module->init_array_idx = -1;
+   dyn_module->init_arraysz = 0;
+
+   return dyn_module;
+}
+
+/*****************************************************************************/
+/* detach_loaded_module()                                                    */
+/*                                                                           */
+/*    Detach loaded module data structure from given dynamic module.
When */ +/* an object file has been successfully loaded, the loader core will */ +/* detach the loaded module data structure from the dynamic module data */ +/* structure because the loaded module must continue to persist until is */ +/* is actually unloaded from target memory. If there is a problem with */ +/* the load, then the host memory associated with the loaded module will */ +/* be released as part of the destruction of the dynamic module. */ +/* */ +/*****************************************************************************/ +static +DLIMP_Loaded_Module *detach_loaded_module(DLIMP_Dynamic_Module *dyn_module) +{ + if (dyn_module && dyn_module->loaded_module) + { + DLIMP_Loaded_Module *loaded_module = dyn_module->loaded_module; + dyn_module->loaded_module = NULL; + return loaded_module; + } + + return NULL; +} +/*****************************************************************************/ +/* delete_DLIMP_Dynamic_Module() */ +/* */ +/* Remove local copies of the string table, symbol table, program header */ +/* table, and dynamic table. */ +/* */ +/*****************************************************************************/ +static void delete_DLIMP_Dynamic_Module(DLOAD_HANDLE handle, + DLIMP_Dynamic_Module **ppdm) +{ + DLIMP_Dynamic_Module *dyn_module = NULL; + + if (!ppdm || (*ppdm == NULL)) + { + DLIF_error(DLET_MISC, + "Internal Error: invalid argument to dynamic module " + "destructor function; aborting loader\n"); + DLIF_exit(1); + } + + dyn_module = *ppdm; + if (dyn_module->name) DLIF_free(dyn_module->name); + if (dyn_module->strtab) DLIF_free(dyn_module->strtab); + if (dyn_module->symtab) DLIF_free(dyn_module->symtab); + if (dyn_module->phdr) DLIF_free(dyn_module->phdr); + if (dyn_module->dyntab) DLIF_free(dyn_module->dyntab); + + /*------------------------------------------------------------------------*/ + /* If we left the loaded module attached to the dynamic module, then */ + /* something must have gone wrong with the load. 
Remove the loaded */ + /* module from the queue of loaded modules, if it is there. Then free */ + /* the host memory allocated to the loaded module and its segments. */ + /*------------------------------------------------------------------------*/ + if (dyn_module->loaded_module != NULL) + delete_DLIMP_Loaded_Module(handle, &(dyn_module->loaded_module)); + + /*------------------------------------------------------------------------*/ + /* Finally, free the host memory for this dynamic module object and NULL */ + /* the pointer to the object. */ + /*------------------------------------------------------------------------*/ + DLIF_free(dyn_module); + *ppdm = NULL; +} + +/*****************************************************************************/ +/* file_header_magic_number_is_valid() */ +/* */ +/* Given an object file header, check the magic number to ensure that it */ +/* is an object file format that we recognize. This implementation of */ +/* the dynamic loader core will handle ELF object file format. */ +/* */ +/*****************************************************************************/ +static BOOL file_header_magic_number_is_valid(struct Elf32_Ehdr* header) +{ + /*------------------------------------------------------------------------*/ + /* Check for correct ELF magic numbers in file header. */ + /*------------------------------------------------------------------------*/ + if (!header->e_ident[EI_MAG0] == ELFMAG0 || + !header->e_ident[EI_MAG1] == ELFMAG1 || + !header->e_ident[EI_MAG2] == ELFMAG2 || + !header->e_ident[EI_MAG3] == ELFMAG3) + { + DLIF_error(DLET_FILE, "Invalid ELF magic number.\n"); + return FALSE; + } + + return TRUE; +} + +/*****************************************************************************/ +/* file_header_machine_is_valid() */ +/* */ +/* Check if the machine specified in the file header is supported by the */ +/* loader. 
If the loader was compiled with support for all targets, */ +/* the machine will be initially set to EM_NONE. Once a module has been */ +/* loaded, all remaining modules must have the same machine value. */ +/*****************************************************************************/ +static int file_header_machine_is_valid(Elf32_Half e_machine) +{ + /*------------------------------------------------------------------------*/ + /* Currently we support only ARM or C6x */ + /*------------------------------------------------------------------------*/ + switch(e_machine) + { +#ifdef ARM_TARGET + case EM_ARM : return TRUE; +#endif +#ifdef C60_TARGET + case EM_TI_C6000 : return TRUE; +#endif + + default : return FALSE; + } +} + +/*****************************************************************************/ +/* is_valid_elf_object_file() */ +/* */ +/* Check file size against anticipated end location of string table, */ +/* symbol table, program header tables, etc. If we anything untoward, */ +/* then we declare that the ELF file is corrupt and the load is aborted. */ +/* */ +/*****************************************************************************/ +static BOOL is_valid_elf_object_file(LOADER_FILE_DESC *fd, + DLIMP_Dynamic_Module *dyn_module) +{ + uint32_t fsz; + int i; + + /*------------------------------------------------------------------------*/ + /* Get file size. */ + /*------------------------------------------------------------------------*/ + DLIF_fseek(fd, 0, LOADER_SEEK_END); + fsz = DLIF_ftell(fd); + + /*------------------------------------------------------------------------*/ + /* Check for invalid table sizes (string table, symbol table, and */ + /* program header tables). 
*/ + /*------------------------------------------------------------------------*/ + if (!((dyn_module->strsz < fsz) && + (dyn_module->symnum < fsz) && + (dyn_module->phnum * sizeof(struct Elf32_Phdr)) < fsz)) + { + DLIF_error(DLET_FILE, "Invalid ELF table bounds.\n"); + return FALSE; + } + + /*------------------------------------------------------------------------*/ + /* Check for null so_name string in file with dynamic information. */ + /*------------------------------------------------------------------------*/ + if (dyn_module->dyntab && !strcmp(dyn_module->name, "")) + { + DLIF_error(DLET_MISC, "Dynamic file lacks SO_NAME identifier.\n"); + return FALSE; + } + + /*------------------------------------------------------------------------*/ + /* Check for invalid program header information. */ + /*------------------------------------------------------------------------*/ + for (i = 0; i < dyn_module->phnum; i++) + { + struct Elf32_Phdr* phdr = dyn_module->phdr + i; + + /*---------------------------------------------------------------------*/ + /* Sanity check for relative sizes of filesz and memsz. */ + /*---------------------------------------------------------------------*/ + if (!(phdr->p_type != PT_LOAD || phdr->p_filesz <= phdr->p_memsz)) + { + DLIF_error(DLET_MISC, + "Invalid file or memory size for segment %d.\n", i); + return FALSE; + } + + /*---------------------------------------------------------------------*/ + /* Check that segment file offset doesn't go off the end of the file. */ + /*---------------------------------------------------------------------*/ + if (!(phdr->p_offset + phdr->p_filesz < fsz)) + { + DLIF_error(DLET_FILE, + "File location of segment %d is past the end of file.\n", i); + return FALSE; + } + } + + /*------------------------------------------------------------------------*/ + /* Check that a ET_DYN-type file is relocatable. 
*/ + /*------------------------------------------------------------------------*/ + if (dyn_module->fhdr.e_type == ET_DYN && !dyn_module->symtab) return FALSE; + + /*------------------------------------------------------------------------*/ + /* All checks passed. */ + /*------------------------------------------------------------------------*/ + return TRUE; +} + +/*****************************************************************************/ +/* process_eiosabi() */ +/* */ +/* Check the EI_OSABI field to validate it and set any parameters based on */ +/* it. */ +/*****************************************************************************/ +static BOOL process_eiosabi(DLIMP_Dynamic_Module* dyn_module) +{ + return cur_target->process_eiosabi(dyn_module); +} + +/*****************************************************************************/ +/* dload_file_header() */ +/* */ +/* Read ELF file header. Store critical information in the provided */ +/* DLIMP_Dynamic_Module record. Check file header for validity. */ +/* */ +/*****************************************************************************/ +static BOOL dload_file_header(LOADER_FILE_DESC *fd, + DLIMP_Dynamic_Module *dyn_module) +{ + /*------------------------------------------------------------------------*/ + /* Read ELF file header from given input file. */ + /*------------------------------------------------------------------------*/ + DLIF_fread(&(dyn_module->fhdr), sizeof(struct Elf32_Ehdr), 1, fd); + + /*------------------------------------------------------------------------*/ + /* Determine target vs. host endian-ness. Does header data need to be */ + /* byte swapped? */ + /*------------------------------------------------------------------------*/ + dyn_module->wrong_endian = + (dyn_module->fhdr.e_ident[EI_DATA] != DLIMP_get_endian()); + + /*------------------------------------------------------------------------*/ + /* Swap file header structures, if needed. 
*/ + /*------------------------------------------------------------------------*/ + if (dyn_module->wrong_endian) + DLIMP_change_ehdr_endian(&(dyn_module->fhdr)); + + /*------------------------------------------------------------------------*/ + /* Write out magic ELF information for debug purposes. */ + /*------------------------------------------------------------------------*/ +#if LOADER_DEBUG + if (debugging_on) + { + DLIF_trace("ELF: %c%c%c\n", dyn_module->fhdr.e_ident[1], + dyn_module->fhdr.e_ident[2], + dyn_module->fhdr.e_ident[3]); + DLIF_trace("ELF file header entry point: %x\n", + dyn_module->fhdr.e_entry); + } +#endif + + + /*------------------------------------------------------------------------*/ + /* Verify magic numbers in ELF file header. */ + /*------------------------------------------------------------------------*/ + if (!file_header_magic_number_is_valid(&(dyn_module->fhdr))) + { + DLIF_error(DLET_FILE, "Invalid ELF file header magic number.\n"); + return FALSE; + } + + if (!file_header_machine_is_valid(dyn_module->fhdr.e_machine)) + { + DLIF_error(DLET_FILE, "Invalid ELF file target machine.\n"); + return FALSE; + } + + /*------------------------------------------------------------------------*/ + /* Verify file is an executable or dynamic shared object or library. */ + /*------------------------------------------------------------------------*/ + if ((dyn_module->fhdr.e_type != ET_EXEC) && + (dyn_module->fhdr.e_type != ET_DYN)) + { + DLIF_error(DLET_FILE, "Invalid ELF file type.\n"); + return FALSE; + } + +#if LOADER_DEBUG || LOADER_PROFILE + /*------------------------------------------------------------------------*/ + /* Stop profiling clock when file header information has finished */ + /* loading. Re-start clock on initialization of symbol table, and */ + /* dynamic table pointers. 
*/ + /*------------------------------------------------------------------------*/ + if (debugging_on || profiling_on) + { + DLIF_trace("done.\n"); + if (profiling_on) + { + profile_stop_clock(); + DLIF_trace("Took %lu cycles.\n", + (unsigned long)profile_cycle_count()); + profile_start_clock(); + } + } +#endif + + return TRUE; +} + +/*****************************************************************************/ +/* dload_program_header_table() */ +/* */ +/* Make a local copy of the ELF object file's program header table in the */ +/* dynamic module data structure. */ +/* */ +/*****************************************************************************/ +static void dload_program_header_table(LOADER_FILE_DESC *fd, + DLIMP_Dynamic_Module *dyn_module) +{ + /*------------------------------------------------------------------------*/ + /* Read the program header tables from the object file. */ + /*------------------------------------------------------------------------*/ + struct Elf32_Ehdr *fhdr = &(dyn_module->fhdr); + dyn_module->phdr = (struct Elf32_Phdr*) + (DLIF_malloc(fhdr->e_phnum * fhdr->e_phentsize)); + DLIF_fseek(fd, fhdr->e_phoff, LOADER_SEEK_SET); + DLIF_fread(dyn_module->phdr, fhdr->e_phentsize, fhdr->e_phnum,fd); + dyn_module->phnum = fhdr->e_phnum; + + /*------------------------------------------------------------------------*/ + /* Byte swap the program header tables if the target endian-ness is not */ + /* the same as the host endian-ness. */ + /*------------------------------------------------------------------------*/ + if (dyn_module->wrong_endian) + { + int i; + for (i = 0; i < dyn_module->phnum; i++) + DLIMP_change_phdr_endian(dyn_module->phdr + i); + } +} + +/*****************************************************************************/ +/* dload_headers() */ +/* */ +/* Read ELF object file header and program header table information into */ +/* the given dynamic module data structure. 
If the object file contains */ +/* dynamic information, read in the dynamic tags, dynamic symbol table, */ +/* and global string table. Check to make sure that we are not already */ +/* in the process of loading the module (circular dependencies), then */ +/* perform some level of sanity checking on the content of the file to */ +/* provide some assurance that the file is not corrupted. */ +/* */ +/*****************************************************************************/ +static BOOL dload_headers(LOADER_FILE_DESC *fd, + DLIMP_Dynamic_Module *dyn_module) +{ +#if LOADER_DEBUG || LOADER_PROFILE + /*------------------------------------------------------------------------*/ + /* More progress information. Start timing if profiling is enabled. */ + /*------------------------------------------------------------------------*/ + if (debugging_on || profiling_on) + { + DLIF_trace("\nReading file headers ...\n"); + if (profiling_on) profile_start_clock(); + } +#endif + + /*------------------------------------------------------------------------*/ + /* Read file header information and check vs. expected ELF object file */ + /* header content. */ + /*------------------------------------------------------------------------*/ + if (!dload_file_header(fd, dyn_module)) + return FALSE; + + /*------------------------------------------------------------------------*/ + /* Read program header table information into the dynamic module object. */ + /*------------------------------------------------------------------------*/ + dload_program_header_table(fd, dyn_module); + + /*------------------------------------------------------------------------*/ + /* Once headers have been read in, use e_machine to set virtual target. */ + /* This can then be used to access target specific functions. 
*/ + /*------------------------------------------------------------------------*/ + cur_target = get_vt_obj(dyn_module->fhdr.e_machine); + if (!cur_target) + { + DLIF_error(DLET_FILE, "Attempt to load invalid ELF file, '%s'.\n", + dyn_module->name); + return FALSE; + } + + return TRUE; +} + +/*****************************************************************************/ +/* find_dynamic_segment() */ +/* */ +/* Find the dynamic segment in the given ELF object file, if there is */ +/* one. If the segment is found, then the segment ID output parameter */ +/* is set to the index of the dynamic segment in the program header */ +/* table. If the dynamic segment is not found, the dynamic module's */ +/* relocatable flag is set to FALSE, and return FALSE. */ +/* */ +/*****************************************************************************/ +static BOOL find_dynamic_segment(DLIMP_Dynamic_Module *dyn_module, + Elf32_Word *dyn_seg_idx) +{ + int i; + + /*------------------------------------------------------------------------*/ + /* We should have a valid dynamic module pointer and somewhere to put the */ + /* dynamic segment id, if we find one. If either of these are missing, */ + /* we should get an internal error and abort the loader. */ + /*------------------------------------------------------------------------*/ + if ((dyn_module == NULL) || (dyn_seg_idx == NULL)) + { + DLIF_error(DLET_MISC, "Internal error: find_dynamic_segment() needs " + "non-NULL arguments.\n"); + DLIF_exit(1); + } + + /*------------------------------------------------------------------------*/ + /* Spin through segment program headers to find the dynamic segment. 
*/ + /*------------------------------------------------------------------------*/ + dyn_module->relocatable = TRUE; + for (i = 0; i < dyn_module->phnum; i++) + if (dyn_module->phdr[i].p_type == PT_DYNAMIC) + { *dyn_seg_idx = i; return TRUE; } + + /*------------------------------------------------------------------------*/ + /* No dynamic segment found, mark the object module as not relocatable */ + /* and warn the user. */ + /*------------------------------------------------------------------------*/ + dyn_module->relocatable = FALSE; + + return FALSE; +} + +/*****************************************************************************/ +/* copy_dynamic_table() */ +/* */ +/* Make a local copy of the dynamic table read from the dynamic segment */ +/* in the ELF object file. */ +/* */ +/*****************************************************************************/ +static void copy_dynamic_table(LOADER_FILE_DESC *fd, + DLIMP_Dynamic_Module *dyn_module, + Elf32_Word dyn_seg_idx) +{ + /*------------------------------------------------------------------------*/ + /* Allocate space for the dynamic table from host memory and read its */ + /* content from the ELF object file. */ + /*------------------------------------------------------------------------*/ + Elf32_Word num_elem; + dyn_module->dyntab = DLIF_malloc(dyn_module->phdr[dyn_seg_idx].p_filesz); + num_elem = dyn_module->phdr[dyn_seg_idx].p_filesz / sizeof(struct Elf32_Dyn); + DLIF_fseek(fd, dyn_module->phdr[dyn_seg_idx].p_offset, LOADER_SEEK_SET); + DLIF_fread(dyn_module->dyntab, sizeof(struct Elf32_Dyn), num_elem, fd); + + /*------------------------------------------------------------------------*/ + /* If necessary, byte swap each entry in the dynamic table. 
*/ + /*------------------------------------------------------------------------*/ + if (dyn_module->wrong_endian) + { + int i; + for (i = 0; i < num_elem; i++) + DLIMP_change_dynent_endian(&dyn_module->dyntab[i]); + } +} + +/*****************************************************************************/ +/* process_target_dynamic_tag() */ +/* */ +/* Process a target specific dynamic tag entry. Returns TRUE if the tag */ +/* was handled and FALSE if it was not recognized. */ +/*****************************************************************************/ +static BOOL process_target_dynamic_tag(DLIMP_Dynamic_Module* dyn_module, int i) +{ + return cur_target->process_dynamic_tag(dyn_module, i); +} + +/*****************************************************************************/ +/* process_dynamic_table() */ +/* */ +/* Process dynamic tag entries from the dynamic table. At the conclusion */ +/* of this function, we should have made a copy of the global symbols */ +/* and the global symbol names. */ +/* */ +/*****************************************************************************/ +static BOOL process_dynamic_table(LOADER_FILE_DESC *fd, + DLIMP_Dynamic_Module *dyn_module) +{ + int i; + BOOL soname_found = FALSE; + Elf32_Addr soname_offset = 0; + Elf32_Addr strtab_offset = 0; + Elf32_Addr hash_offset = 0; + Elf32_Addr symtab_offset = 0; + + /*------------------------------------------------------------------------*/ + /* Iterate over the dynamic table in order to process dynamic tags. */ + /* See ELF TIS Specification for details on the meaning of each dynamic */ + /* tag. The C6000 ELF ABI Specification provides more details about the */ + /* TI specific C6000 ELF ABI tags. 
*/ + /*------------------------------------------------------------------------*/ + for (i = 0; dyn_module->dyntab[i].d_tag != DT_NULL; i++) + { + switch(dyn_module->dyntab[i].d_tag) + { + /*------------------------------------------------------------------*/ + /* DT_SONAME: Contains name of dynamic object, used for dependency */ + /* comparisons. Its value is an offset from the start */ + /* of the string table. We need to copy the string at */ + /* this offset into dmodule->name. */ + /*------------------------------------------------------------------*/ + case DT_SONAME: +#if LOADER_DEBUG + if (debugging_on) DLIF_trace("Found SO_NAME.\n"); +#endif + /*---------------------------------------------------------------*/ + /* We store the offset of the so_name in the dynamic string */ + /* table so that it doesn't matter which dynamic tag we see */ + /* first (DT_SONAME actually is generated before DT_STRTAB). */ + /*---------------------------------------------------------------*/ + soname_found = TRUE; + soname_offset = dyn_module->dyntab[i].d_un.d_ptr; + break; + + /*------------------------------------------------------------------*/ + /* DT_STRSZ: Contains the size of the string table. */ + /*------------------------------------------------------------------*/ + case DT_STRSZ: + dyn_module->strsz = dyn_module->dyntab[i].d_un.d_val; + +#if LOADER_DEBUG + if (debugging_on) + DLIF_trace("Found string table Size: 0x%x\n", dyn_module->strsz); +#endif + break; + + /*------------------------------------------------------------------*/ + /* DT_STRTAB: Contains the file offset of the string table. The */ + /* tag directly after this is guaranteed to be DT_STRSZ, */ + /* containing the string table size. We need to */ + /* allocate memory for the string table and copy it from */ + /* the file. 
*/ + /*------------------------------------------------------------------*/ + case DT_STRTAB: + strtab_offset = dyn_module->dyntab[i].d_un.d_ptr; +#if LOADER_DEBUG + if (debugging_on) + DLIF_trace("Found string table: 0x%x\n", strtab_offset); +#endif + break; + + /*------------------------------------------------------------------*/ + /* DT_HASH: Contains the file offset of the symbol hash table. */ + /*------------------------------------------------------------------*/ + case DT_HASH: + hash_offset = dyn_module->dyntab[i].d_un.d_ptr; +#if LOADER_DEBUG + if (debugging_on) + DLIF_trace("Found symbol hash table: 0x%x\n", hash_offset); +#endif + break; + + /*------------------------------------------------------------------*/ + /* DT_SYMTAB: Contains the file offset of the symbol table. */ + /*------------------------------------------------------------------*/ + case DT_SYMTAB: + symtab_offset = dyn_module->dyntab[i].d_un.d_ptr; +#if LOADER_DEBUG + if (debugging_on) + DLIF_trace("Found symbol table: 0x%x\n", symtab_offset); +#endif + break; + + /*------------------------------------------------------------------*/ + /* DSO Initialization / Termination Model Dynamic Tags */ + /*------------------------------------------------------------------*/ + /* For initialization tags, we store indices and array sizes in */ + /* the dyn_module. Termination works a little different, the */ + /* indices into the local copy of the dynamic table are stored in */ + /* dyn_module, but the DT_FINI_ARRAYSZ value is recorded with the */ + /* loaded module. */ + /*------------------------------------------------------------------*/ + /* After placement is done, the DT_FINI and DT_FINI_ARRAY values */ + /* need to be copied from the local dynamic table into the loaded */ + /* module object. 
*/ + /*------------------------------------------------------------------*/ + case DT_PREINIT_ARRAY: + dyn_module->preinit_array_idx = i; + break; + + case DT_PREINIT_ARRAYSZ: + dyn_module->preinit_arraysz = dyn_module->dyntab[i].d_un.d_val; + break; + + case DT_INIT: + dyn_module->init_idx = i; + break; + + case DT_INIT_ARRAY: + dyn_module->init_array_idx = i; + break; + + case DT_INIT_ARRAYSZ: + dyn_module->init_arraysz = dyn_module->dyntab[i].d_un.d_val; + break; + + /*------------------------------------------------------------------*/ + /* This information will be copied over to the loaded module */ + /* object after placement has been completed and the information */ + /* in the dynamic table has been relocated. */ + /*------------------------------------------------------------------*/ + case DT_FINI_ARRAY: + case DT_FINI_ARRAYSZ: + case DT_FINI: + break; + + /*------------------------------------------------------------------*/ + /* Unrecognized tag, may not be illegal, but is not explicitly */ + /* handled by this function. Should it be? */ + /*------------------------------------------------------------------*/ + default: + { + if (!process_target_dynamic_tag(dyn_module, i)) + { +#if LOADER_DEBUG + if (debugging_on) + DLIF_trace("Unrecognized dynamic tag: 0x%X\n", + dyn_module->dyntab[i].d_tag); +#endif + } + + break; + } + + } + } + + /*------------------------------------------------------------------------*/ + /* If string table offset and size were found, read string table in from */ + /* the ELF object file. 
*/ + /*------------------------------------------------------------------------*/ + if (strtab_offset && dyn_module->strsz) + { + DLIF_fseek(fd, strtab_offset, LOADER_SEEK_SET); + dyn_module->strtab = DLIF_malloc(dyn_module->strsz); + DLIF_fread(dyn_module->strtab, sizeof(uint8_t), dyn_module->strsz, fd); + } + else + { + DLIF_warning(DLWT_MISC, + "Mandatory dynamic tag DT_STRTAB/DT_STRSZ not found!\n"); + return FALSE; + } + + + /*------------------------------------------------------------------------*/ + /* If symbol hash table is found read-in the hash table. */ + /*------------------------------------------------------------------------*/ + if (hash_offset) + { + /*---------------------------------------------------------------------*/ + /* Hash table has the following format. nchain equals the number of */ + /* entries in the symbol table (symnum) */ + /* */ + /* +----------------------------+ */ + /* | nbucket | */ + /* +----------------------------+ */ + /* | nchain | */ + /* +----------------------------+ */ + /* | bucket[0] | */ + /* | ... | */ + /* | bucket[nbucket-1] | */ + /* +----------------------------+ */ + /* | chain[0] | */ + /* | ... | */ + /* | chain[nchain-1] | */ + /* +----------------------------+ */ + /*---------------------------------------------------------------------*/ + Elf32_Word hash_nbucket; + Elf32_Word hash_nchain; + + /*---------------------------------------------------------------------*/ + /* Seek to the hash offset and read first two words into nbucket and */ + /* symnum. 
*/ + /*---------------------------------------------------------------------*/ + DLIF_fseek(fd, hash_offset, LOADER_SEEK_SET); + DLIF_fread(&(hash_nbucket), sizeof(Elf32_Word), 1, fd); + DLIF_fread(&(hash_nchain), sizeof(Elf32_Word), 1, fd); + if (dyn_module->wrong_endian) + { + DLIMP_change_endian32((int32_t*)(&(hash_nbucket))); + DLIMP_change_endian32((int32_t*)(&(hash_nchain))); + } + + /*---------------------------------------------------------------------*/ + /* The number of entires in the dynamic symbol table is not encoded */ + /* anywhere in the elf file. However, the nchain is guaranteed to be */ + /* the same as the number of symbols. Use nchain to set the symnum. */ + /*---------------------------------------------------------------------*/ + dyn_module->symnum = hash_nchain; +#if LOADER_DEBUG + if (debugging_on) DLIF_trace("symnum=%d\n", hash_nchain); +#endif + } + else + { + DLIF_warning(DLWT_MISC, "Mandatory dynamic tag DT_HASH is not found!\n"); + return FALSE; + } + + /*------------------------------------------------------------------------*/ + /* Read dynamic symbol table. */ + /*------------------------------------------------------------------------*/ + if (symtab_offset) + { + int j = 0; + DLIF_fseek(fd, symtab_offset, LOADER_SEEK_SET); + dyn_module->symtab = + DLIF_malloc(dyn_module->symnum * sizeof(struct Elf32_Sym)); + DLIF_fread(dyn_module->symtab, sizeof(struct Elf32_Sym), + dyn_module->symnum, fd); + if (dyn_module->wrong_endian) + { + for (j = 0; j < dyn_module->symnum; j++) + DLIMP_change_sym_endian(dyn_module->symtab + j); + } + + /*---------------------------------------------------------------------*/ + /* The st_name field of an Elf32_Sym entity is an offset into the */ + /* string table. Convert it into a pointer to the string. 
*/ + /*---------------------------------------------------------------------*/ + if (strtab_offset) + for (j = 0; j < dyn_module->symnum; j++) + dyn_module->symtab[j].st_name += (Elf32_Word) dyn_module->strtab; + } + else + { + DLIF_warning(DLWT_MISC, + "Mandatory dynamic tag DT_SYMTAB is not found!\n"); + return FALSE; + } + + /*------------------------------------------------------------------------*/ + /* Read the SONAME. */ + /*------------------------------------------------------------------------*/ + if (!soname_found) + { + DLIF_warning(DLWT_MISC, "Dynamic tag DT_SONAME is not found!\n"); + dyn_module->name = DLIF_malloc(sizeof(char)); + *dyn_module->name = '\0'; + } + else + { + dyn_module->name = + DLIF_malloc(strlen(dyn_module->strtab + soname_offset) + 1); + strcpy(dyn_module->name, dyn_module->strtab + soname_offset); + +#if LOADER_DEBUG + if (debugging_on) + DLIF_trace("Name of dynamic object: %s\n", dyn_module->name); +#endif + } + + return TRUE; +} + + +/*****************************************************************************/ +/* dload_dynamic_information() */ +/* */ +/* Given a dynamic module with a dynamic segment which is located via */ +/* given dynamic segment index, make a local copy of the dynamic table */ +/* in the dynamic module object, then process the dynamic tag entries in */ +/* the table. */ +/* */ +/*****************************************************************************/ +static BOOL dload_dynamic_information(LOADER_FILE_DESC *fd, + DLIMP_Dynamic_Module *dyn_module, + Elf32_Word dyn_seg_idx) +{ + /*------------------------------------------------------------------------*/ + /* Read a copy of the dynamic table into the dynamic module object. */ + /*------------------------------------------------------------------------*/ + copy_dynamic_table(fd, dyn_module, dyn_seg_idx); + + /*------------------------------------------------------------------------*/ + /* Process dynamic entries in the dynamic table. 
If any problems are */ + /* encountered, the loader should emit an error or warning and return */ + /* FALSE here. */ + /*------------------------------------------------------------------------*/ + return process_dynamic_table(fd, dyn_module); +} + +/*****************************************************************************/ +/* check_circular_dependency() */ +/* */ +/* Determine whether a dynamic module is already in the process of being */ +/* loaded before we try to start loading it again. If it is already */ +/* being loaded, then the dynamic loader has detected a circular */ +/* dependency. An error will be emitted and the load will be aborted. */ +/* */ +/*****************************************************************************/ +static BOOL check_circular_dependency(DLOAD_HANDLE handle, + const char *dyn_mod_name) +{ + /*------------------------------------------------------------------------*/ + /* Check the name of the given dependency module to be loaded against the */ + /* list of modules that are currently in the process of being loaded. */ + /* Report an error if any circular dependencies are detected. */ + /*------------------------------------------------------------------------*/ + int i; + LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle; + + for (i = 0; i < pHandle->DLIMP_module_dependency_list.size; i++) + if (!strcmp(dyn_mod_name, + ((char**)(pHandle->DLIMP_module_dependency_list.buf))[i])) + { + DLIF_error(DLET_MISC, + "Circular dependency detected, '%s' is already in the " + "process of loading.\n", dyn_mod_name); + return FALSE; + } + + return TRUE; +} + +/*****************************************************************************/ +/* dload_dynamic_segment() */ +/* */ +/* Find the dynamic segment in the given ELF module, if there is one. */ +/* If there is a dynamic segment, then make a local copy of the dynamic */ +/* table in the dynamic module object provided, then process the dynamic */ +/* tag entries in the table. 
*/ +/* */ +/* If there is no dynamic segment, then we return success from this */ +/* function, marking the dynamic module as "not relocatable". */ +/* */ +/*****************************************************************************/ +static BOOL dload_dynamic_segment(DLOAD_HANDLE handle, + LOADER_FILE_DESC *fd, + DLIMP_Dynamic_Module *dyn_module) +{ + /*------------------------------------------------------------------------*/ + /* If we don't find dynamic segment, the relocatable flag will have been */ + /* set to false to indicate that the module is a static executable. We */ + /* still return TRUE from this function so that we can proceed with */ + /* static loading. */ + /*------------------------------------------------------------------------*/ + Elf32_Word dyn_seg_idx = 0; + if (!find_dynamic_segment(dyn_module, &dyn_seg_idx)) + return TRUE; + + /*------------------------------------------------------------------------*/ + /* Process the OSABI now, after we know if the module is relocatable. */ + /*------------------------------------------------------------------------*/ + if (!process_eiosabi(dyn_module)) + { + DLIF_error(DLET_FILE, "Unsupported EI_OSABI value.\n"); + return FALSE; + } + + /*------------------------------------------------------------------------*/ + /* Read the dynamic table from the ELF file, then process the dynamic */ + /* tags in the table. */ + /*------------------------------------------------------------------------*/ + if (!dload_dynamic_information(fd, dyn_module, dyn_seg_idx)) + return FALSE; + + /*------------------------------------------------------------------------*/ + /* Check to make sure that this module is not already being loaded. If */ + /* is, then it will cause a circular dependency to be introduced. */ + /* Loader should detect circular dependencies and emit an error. 
*/ + /*------------------------------------------------------------------------*/ + if (!check_circular_dependency(handle, dyn_module->name)) + return FALSE; + + return TRUE; +} + +/*****************************************************************************/ +/* COPY_SEGMENTS() - */ +/* */ +/* Copy all segments into host memory. */ +/*****************************************************************************/ +static void copy_segments(DLOAD_HANDLE handle, LOADER_FILE_DESC* fp, + DLIMP_Dynamic_Module* dyn_module) +{ + DLIMP_Loaded_Segment* seg = + (DLIMP_Loaded_Segment*)(dyn_module->loaded_module->loaded_segments.buf); + int s, seg_size = dyn_module->loaded_module->loaded_segments.size; + LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle; + + + for (s=0; s<seg_size; s++) + { + struct DLOAD_MEMORY_REQUEST targ_req; + targ_req.fp = fp; + targ_req.segment = seg[s].obj_desc; + targ_req.offset = seg[s].phdr.p_offset; + targ_req.flags = DLOAD_SF_relocatable; + + if (seg[s].phdr.p_flags & PF_X) targ_req.flags |= DLOAD_SF_executable; + if (seg[s].phdr.p_flags & PF_W) targ_req.flags |= DLOAD_SF_writable; + + targ_req.align = seg[s].phdr.p_align; + + /*---------------------------------------------------------------------*/ + /* Copy segment data from the file into host buffer where it can */ + /* be relocated. */ + /*---------------------------------------------------------------------*/ + DLIF_copy(pHandle->client_handle, &targ_req); + seg[s].host_address = targ_req.host_address; + } +} + +/*****************************************************************************/ +/* WRITE_SEGMENTS() - */ +/* */ +/* Write all segments to target memory. 
*/
/*****************************************************************************/
static void write_segments(DLOAD_HANDLE handle,
                           LOADER_FILE_DESC* fp,
                           DLIMP_Dynamic_Module* dyn_module)
{
   LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
   DLIMP_Loaded_Segment *segments =
      (DLIMP_Loaded_Segment*)(dyn_module->loaded_module->loaded_segments.buf);
   int num_segments = dyn_module->loaded_module->loaded_segments.size;
   int idx;

   for (idx = 0; idx < num_segments; idx++)
   {
      DLIMP_Loaded_Segment *cur = &segments[idx];
      struct DLOAD_MEMORY_REQUEST request;

      /*---------------------------------------------------------------------*/
      /* Describe the segment, including the host buffer recorded for it.    */
      /*---------------------------------------------------------------------*/
      request.fp           = fp;
      request.segment      = cur->obj_desc;
      request.offset       = cur->phdr.p_offset;
      request.align        = cur->phdr.p_align;
      request.host_address = cur->host_address;

      request.flags = DLOAD_SF_relocatable;
      if (cur->phdr.p_flags & PF_X) request.flags |= DLOAD_SF_executable;
      if (cur->phdr.p_flags & PF_W) request.flags |= DLOAD_SF_writable;

      /*---------------------------------------------------------------------*/
      /* Have the client write the host buffer out to target memory.         */
      /*---------------------------------------------------------------------*/
      DLIF_write(loader->client_handle, &request);
   }
}

/*****************************************************************************/
/* SEG_HAS_SPACE_FOR_WRITE() -                                               */
/*                                                                           */
/*    Check if segment has enough space to receive contents of .args section.
*/ +/*****************************************************************************/ +static BOOL seg_has_space_for_write(DLIMP_Loaded_Module* lmodule, int sz) +{ + DLIMP_Loaded_Segment* seg = + (DLIMP_Loaded_Segment*)(lmodule->loaded_segments.buf); + int s, seg_size = lmodule->loaded_segments.size; + + Elf32_Addr write_address = (Elf32_Addr)lmodule->c_args; + + for (s=0; s<seg_size; s++) + { + Elf32_Addr seg_boundary = + seg[s].phdr.p_vaddr + seg[s].obj_desc->memsz_in_bytes; + + /*---------------------------------------------------------------------*/ + /* If address to write to is greater than segment addr and less than */ + /* segment end, it must lie in current segment. */ + /*---------------------------------------------------------------------*/ + if ((write_address >= seg[s].phdr.p_vaddr) && + (write_address < seg_boundary)) + { + if ((write_address + sz) > seg_boundary) + { +#if LOADER_DEBUG + if (debugging_on) + { + DLIF_trace("Write requires 0x%x bytes\n",write_address + sz); + DLIF_trace("Seg boundary at : 0x%x\n",seg_boundary); + DLIF_trace("WARNING - Not enough space in segment\n"); + } +#endif + return FALSE; + } + else return TRUE; + } + } + /*------------------------------------------------------------------------*/ + /* Given address doesn't belong to any known segment. */ + /*------------------------------------------------------------------------*/ + return FALSE; +} + + +/*****************************************************************************/ +/* DLOAD_initialize() */ +/* */ +/* Construct and initialize data structures internal to the dynamic */ +/* loader core. */ +/* */ +/*---------------------------------------------------------------------------*/ +/* */ +/* This function is deprecated, replaced by DLOAD_create(). 
*/ +/* */ +/*****************************************************************************/ +void DLOAD_initialize(DLOAD_HANDLE handle) +{ +} + +/*****************************************************************************/ +/* DLOAD_finalize() */ +/* */ +/* Destroy and finalize data structures internal to the dynamic */ +/* loader core. */ +/* */ +/*---------------------------------------------------------------------------*/ +/* */ +/* This function is deprecated, replaced by DLOAD_destroy(). */ +/* */ +/*****************************************************************************/ +void DLOAD_finalize(DLOAD_HANDLE handle) +{ +} + +/*****************************************************************************/ +/* dload_static_executable() */ +/* */ +/* Account for target memory allocated to static executable and wrap up */ +/* loading. No relocation is necessary. */ +/* */ +/*****************************************************************************/ +static int32_t dload_static_executable(DLOAD_HANDLE handle, + LOADER_FILE_DESC *fd, + DLIMP_Dynamic_Module *dyn_module) +{ + int32_t local_file_handle = 0; + +#if LOADER_DEBUG + if (debugging_on) DLIF_trace("Starting dload_static_executable() ...\n"); +#endif + + /*------------------------------------------------------------------------*/ + /* Set entry point for static executable and attempt to allocate target */ + /* memory for the static executable. */ + /*------------------------------------------------------------------------*/ + dyn_module->loaded_module->entry_point = dyn_module->fhdr.e_entry; + if (load_static_segment(handle, fd, dyn_module) && + load_object(fd, dyn_module)) + { + /*---------------------------------------------------------------------*/ + /* If successful, we'll want to detach the loaded module object from */ + /* the dynamic module object that created it. Take note of the file */ + /* handle. 
*/ + /*---------------------------------------------------------------------*/ + DLIMP_Loaded_Module *loaded_module = detach_loaded_module(dyn_module); + local_file_handle = loaded_module->file_handle; + } + + /*------------------------------------------------------------------------*/ + /* Static load failed. Flag an error. */ + /*------------------------------------------------------------------------*/ + else + DLIF_error(DLET_MEMORY, + "Failed to allocate target memory for static executable.\n"); + + /*------------------------------------------------------------------------*/ + /* Destruct dynamic module object. */ + /*------------------------------------------------------------------------*/ + delete_DLIMP_Dynamic_Module(handle, &dyn_module); + +#if LOADER_DEBUG + if (debugging_on) DLIF_trace("Finished dload_static_executable()\n"); +#endif + + return local_file_handle; +} + +#if LOADER_DEBUG || LOADER_PROFILE +int DLREL_relocations; +time_t DLREL_total_reloc_time; +#endif + +/*****************************************************************************/ +/* process_dynamic_module_relocations() */ +/* */ +/* Make a host-accessible copy of all of the segments, process all */ +/* relocation entries associated with the given module within that */ +/* space, then write the updated segment buffers back out to target */ +/* memory. 
*/ +/* */ +/*****************************************************************************/ +static void process_dynamic_module_relocations(DLOAD_HANDLE handle, + LOADER_FILE_DESC *fd, + DLIMP_Dynamic_Module *dyn_module) +{ +#if LOADER_DEBUG || LOADER_PROFILE + if(debugging_on || profiling_on) + { + DLIF_trace("Running relocate()...\n"); + if (profiling_on) profile_start_clock(); + } +#endif + + /*------------------------------------------------------------------------*/ + /* Copy segments from file to host memory */ + /*------------------------------------------------------------------------*/ + copy_segments(handle, fd, dyn_module); + + /*------------------------------------------------------------------------*/ + /* Process dynamic relocations. */ + /*------------------------------------------------------------------------*/ + DLREL_relocate(handle, fd, dyn_module); + + /*------------------------------------------------------------------------*/ + /* Write segments from host memory to target memory */ + /*------------------------------------------------------------------------*/ + write_segments(handle, fd, dyn_module); + +#if LOADER_DEBUG || LOADER_PROFILE + /*------------------------------------------------------------------------*/ + /* Report timing and progress information for relocation step. */ + /*------------------------------------------------------------------------*/ + if (debugging_on || profiling_on) + { + if (profiling_on) + { + profile_stop_clock(); + DLIF_trace("Took %lu cycles.\n", + (unsigned long) profile_cycle_count()); + DLIF_trace("Total reloc time: %lu\n", + (unsigned long) DLREL_total_reloc_time); + DLIF_trace("Time per relocation: %ld\n", + DLREL_relocations ? 
DLREL_total_reloc_time / DLREL_relocations : 0); + } + + DLIF_trace("Number of relocations: %d\n", DLREL_relocations); + DLIF_trace("\nAbout to run load_object()..."); + DLREL_total_reloc_time = DLREL_relocations = 0; + if (profiling_on) profile_start_clock(); + } +#endif + +} + +/*****************************************************************************/ +/* store_preinit_data() */ +/* */ +/* Given a dynamic module object, store pre-initialization function */ +/* information. The user may also provide a custom iniitialization */ +/* function that needs to be executed before the compiler */ +/* generated static initialization functions are executed. */ +/* The dynamic loader will now create a table TI_init_table to store */ +/* pre-init and init data. This is done because pre-init and */ +/* init functions could reference as-yet unrelocated symbols from other */ +/* modules. As such it is safer to store relevant function addresses and */ +/* execute them only after all modules are relocated (CQ34088). */ +/* */ +/*****************************************************************************/ +static void store_preinit_data(DLIMP_Dynamic_Module *dyn_module) +{ + IF_single_record *preinit_rec = NULL; + /*------------------------------------------------------------------------*/ + /* Check for presence of DT_PREINIT_ARRAY and DT_PREINIT_ARRAYSZ */ + /* dynamic tags associated with this module. The dyn_module object will */ + /* hold the relevant indices into the local copy of the dynamic table. */ + /* The value of the DT_INIT_ARRAY tag will have been updated after */ + /* placement of the module was completed. Arrays of size 0 will be */ + /* ignored (CQ36935). 
*/ + /*------------------------------------------------------------------------*/ + if (dyn_module->preinit_arraysz > 0) + { + preinit_rec = (IF_single_record *)DLIF_malloc(sizeof(IF_single_record)); + /*---------------------------------------------------------------------*/ + /* Retrieve the address of the .preinit_array section from the value */ + /* of the DT_PREINIT_ARRAY tag, and store it in the TI_init_table. */ + /*---------------------------------------------------------------------*/ + preinit_rec->size = dyn_module->preinit_arraysz; + preinit_rec->sect_addr = (TARGET_ADDRESS) + (dyn_module->dyntab[dyn_module->preinit_array_idx].d_un.d_ptr); + } + + if (preinit_rec) IF_table_enqueue(&TI_init_table, preinit_rec); +} + +/*****************************************************************************/ +/* store_init_data() */ +/* */ +/* Given a dynamic module object, save off initialization function(s) for */ +/* all global and static data objects that are defined in the module */ +/* which require construction. The dynamic loader will now create a table */ +/* TI_init_table to store pre-init and init data. This is done because */ +/* pre-init and init functions could reference as-yet unrelocated symbols */ +/* from other modules. As such it is safer to store relevant function */ +/* addresses and execute them only after all modules are relocated. */ +/* */ +/*****************************************************************************/ +static void store_init_data(DLIMP_Dynamic_Module *dyn_module) +{ + /*------------------------------------------------------------------------*/ + /* Check for presence of a DT_INIT dynamic tag associated with this */ + /* module. The dynamic module will hold the index into the local copy of */ + /* the dynamic table. This entry in the dynamic table will have been */ + /* updated after placement of the module is completed. 
*/ + /*------------------------------------------------------------------------*/ + if (dyn_module->init_idx != -1) + { + IF_single_record *init_rec = + (IF_single_record *)DLIF_malloc(sizeof(IF_single_record)); + /*---------------------------------------------------------------------*/ + /* Retrieve the address of the initialization function from the value */ + /* of the DT_INIT tag, and get the client to execute the function. */ + /*---------------------------------------------------------------------*/ + init_rec->size = 0; + init_rec->sect_addr = (TARGET_ADDRESS) + (dyn_module->dyntab[dyn_module->init_idx].d_un.d_ptr); + + IF_table_enqueue(&TI_init_table, init_rec); + } + + /*------------------------------------------------------------------------*/ + /* Check for presence of a DT_INIT_ARRAY and DT_INIT_ARRAYSZ dynamic tags */ + /* associated with this module. The dyn_module object will hold the */ + /* relevant indices into the local copy of the dynamic table. The value */ + /* of the DT_INIT_ARRAY tag will have been updated after placement of the */ + /* module was completed. Arraysz must be a postive number > 0, else it */ + /* be ignored (CQ36935). */ + /*------------------------------------------------------------------------*/ + if (dyn_module->init_arraysz > 0) + { + IF_single_record *arr_rec = + (IF_single_record *)DLIF_malloc(sizeof(IF_single_record)); + /*---------------------------------------------------------------------*/ + /* Retrieve the address of the .init_array section from the value of */ + /* DT_INIT_ARRAY tag. 
*/ + /*---------------------------------------------------------------------*/ + arr_rec->size = dyn_module->init_arraysz; + arr_rec->sect_addr = (TARGET_ADDRESS) + (dyn_module->dyntab[dyn_module->init_array_idx].d_un.d_ptr); + + IF_table_enqueue(&TI_init_table, arr_rec); + } +} + +/*****************************************************************************/ +/* execute_module_initialization() */ +/* */ +/* Given a dynamic module object, execute pre-initialization and */ +/* initialization function(s) for all global and static data objects that */ +/* are defined in the module which require construction. The user may */ +/* also provide a custom iniitialization function that needs to be */ +/* executed before the compiler generated static initialization functions */ +/* are executed. */ +/* Note that the functions to be executed have already been saved off in */ +/* the TI_init_table, by store_preinit_data() and store_init_data(). */ +/* */ +/*****************************************************************************/ +static void execute_module_initialization(DLOAD_HANDLE handle) +{ + IF_single_record *val = NULL; + IF_table_Queue_Node *curr_ptr = TI_init_table.front_ptr; + LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle; + + for (; curr_ptr; curr_ptr = curr_ptr->next_ptr) + { + val = curr_ptr->value; + + /*---------------------------------------------------------------------*/ + /* A size of 0 indicates DT_INIT, otherwise this is an ARRAY. */ + /*---------------------------------------------------------------------*/ + if (val->size != 0) + { + /*------------------------------------------------------------------*/ + /* Now make a loader-accessible copy of the .init_array section. 
*/ + /*------------------------------------------------------------------*/ + int32_t i; + int32_t num_init_fcns = val->size/sizeof(TARGET_ADDRESS); + TARGET_ADDRESS *init_array_buf = (TARGET_ADDRESS *) + DLIF_malloc(val->size); + + DLIF_read(pHandle->client_handle, + init_array_buf, 1, val->size, + (TARGET_ADDRESS)val->sect_addr); + + /*------------------------------------------------------------------*/ + /* Call each function whose address occupies an entry in array in */ + /* the order that they appear in the array. The size of the array is*/ + /* provided by the init_arraysz field in the dynamic module (copied */ + /* earlier when the dynamic table was read in). Make sure that */ + /* function addresses are valid before execution. */ + /*------------------------------------------------------------------*/ + for (i = 0; i < num_init_fcns; i++) + if (init_array_buf[i]) + DLIF_execute(pHandle->client_handle, + (TARGET_ADDRESS)(init_array_buf[i])); + else + DLIF_warning(DLWT_MISC, + "DT_INIT_ARRAY/DT_PREINIT_ARRAY function address is NULL!"); + + DLIF_free(init_array_buf); + } + else + { + if (val->sect_addr) + DLIF_execute(pHandle->client_handle, + (TARGET_ADDRESS)(val->sect_addr)); + else + DLIF_warning(DLWT_MISC, "DT_INIT function address is NULL!"); + } + } +} + +/*****************************************************************************/ +/* adjust_module_init_fini() */ +/* If the dynamic loader need not process the module initialization */ +/* and termination (fini section) then adjust the module info so that */ +/* the respective sizes become zero. */ +/*****************************************************************************/ +static void adjust_module_init_fini(DLIMP_Dynamic_Module *dm) +{ + /*------------------------------------------------------------------------*/ + /* The C6x RTS boot code has the function _c_int00 which performs */ + /* the C/C++ initialization. 
This function processes the .init_array */ + /* to perform the C/C++ initialization and handles termination through */ + /* the at_exit functionality. If the dynamic executable we are loading */ + /* includes _c_int00, the loader assumes that the application code takes */ + /* care of all initialization and termination. Hence the loader won't */ + /* perform the initialization and termination. */ + /* NOTE: Use of __TI_STACK_SIZE is a hack. The _c_int00 symbol is not */ + /* in the dynamic symbol table. The right fix is for the linker */ + /* not to generate the init array tags if the build includes RTS */ + /* boot routine. */ + /*------------------------------------------------------------------------*/ + if (dm->fhdr.e_type == ET_EXEC && + DLSYM_lookup_local_symtab("__TI_STACK_SIZE", dm->symtab, dm->symnum, + NULL)) + { + dm->init_arraysz = 0; + dm->init_array_idx = -1; + + dm->preinit_arraysz = 0; + dm->preinit_array_idx = -1; + + dm->loaded_module->fini_arraysz = 0; + dm->loaded_module->fini_array = (Elf32_Addr) NULL; + dm->loaded_module->fini = (Elf32_Addr) NULL; + } +} + +/*****************************************************************************/ +/* relocate_dependency_graph_modules() */ +/* */ +/* For each dynamic module on the dependency stack, process dynamic */ +/* relocation entries then perform initialization for all global and */ +/* static objects that are defined in tha given module. The stack is */ +/* emptied from the top (LIFO). Each dynamic module object is popped */ +/* off the top of the stack, the module gets relocated, its global and */ +/* static objects that need to be constructed will be constructed, and */ +/* then, after detaching the loaded module object from its dynamic */ +/* module, the dynamic module object is destructed. 
*/ +/* */ +/*****************************************************************************/ +static +int32_t relocate_dependency_graph_modules(DLOAD_HANDLE handle, + LOADER_FILE_DESC *fd, + DLIMP_Dynamic_Module *dyn_module) +{ + /*------------------------------------------------------------------------*/ + /* Processing of relocations will only be triggered when this function */ + /* is called from the top-level object module (at the bottom of the */ + /* dependency graph stack). */ + /*------------------------------------------------------------------------*/ + int32_t local_file_handle = dyn_module->loaded_module->file_handle; + LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle; + dynamic_module_ptr_Stack_Node *ptr = + pHandle->DLIMP_dependency_stack.bottom_ptr; + if (ptr && (ptr->value != dyn_module)) return local_file_handle; + + if (is_dsbt_module(dyn_module)) + { + /*--------------------------------------------------------------------*/ + /* Assign DSBT indices. */ + /*--------------------------------------------------------------------*/ + DLIF_assign_dsbt_indices(); + + /*--------------------------------------------------------------------*/ + /* Update the content of all DSBTs for any module that uses the */ + /* DSBT model. */ + /*--------------------------------------------------------------------*/ + DLIF_update_all_dsbts(); + } + + /*------------------------------------------------------------------------*/ + /* Ok, we are ready to process relocations. The relocation tables */ + /* associated with dependent files will be processed first. Consume */ + /* dynamic module objects from the dependency graph stack from dependents */ + /* to the root of the dependency graph. 
*/ + /*------------------------------------------------------------------------*/ + while (pHandle->DLIMP_dependency_stack.size > 0) + { + DLIMP_Dynamic_Module *dyn_mod_ptr = + dynamic_module_ptr_pop(&pHandle->DLIMP_dependency_stack); + + /*---------------------------------------------------------------------*/ + /* Process dynamic relocations associated with this module. */ + /*---------------------------------------------------------------------*/ + process_dynamic_module_relocations(handle, dyn_mod_ptr->fd, dyn_mod_ptr); + + /*---------------------------------------------------------------------*/ + /* __c_args__ points to the beginning of the .args section, if there */ + /* is one. Record this pointer in the ELF file internal data object. */ + /* Also store this in the loaded module, since this will be needed to */ + /* write argv, argc to .args at execution time. */ + /*---------------------------------------------------------------------*/ + DLSYM_lookup_local_symtab("__c_args__", dyn_mod_ptr->symtab, + dyn_mod_ptr->symnum, + (Elf32_Addr *)&dyn_mod_ptr->c_args); + dyn_mod_ptr->loaded_module->c_args = dyn_mod_ptr->c_args; + + /*---------------------------------------------------------------------*/ + /* Pick up entry point address from ELF file header. */ + /* We currently only support a single entry point into the ELF file. */ + /* To support Braveheart notion of nodes, with multiple entry points,*/ + /* we'll need to get the list of entry points associated with a node,*/ + /* then add capability to the "execute" command to select the entry */ + /* point that we want to start executing from. */ + /*---------------------------------------------------------------------*/ + dyn_mod_ptr->loaded_module->entry_point = dyn_mod_ptr->fhdr.e_entry; + + /*---------------------------------------------------------------------*/ + /* Copy command-line arguments into args section and deal with DSBT */ + /* issues (copy DSBT to its run location). 
*/ + /* Note that below function is commented out because this doesn't do */ + /* much as of now. */ + /*---------------------------------------------------------------------*/ + //load_object(dyn_mod_ptr->fd, dyn_mod_ptr); + + /*---------------------------------------------------------------------*/ + /* Perform initialization, if needed, for this module. */ + /*---------------------------------------------------------------------*/ + store_init_data(dyn_mod_ptr); + + /*---------------------------------------------------------------------*/ + /* Free all dependent file pointers. */ + /*---------------------------------------------------------------------*/ + if (dyn_mod_ptr->fd != fd) + { + DLIF_fclose(dyn_mod_ptr->fd); + dyn_mod_ptr->fd = NULL; + } + + /*---------------------------------------------------------------------*/ + /* Detach loaded module object from the dynamic module object that */ + /* created it, then throw away the dynamic module object. */ + /*---------------------------------------------------------------------*/ + detach_loaded_module(dyn_mod_ptr); + delete_DLIMP_Dynamic_Module(handle, &dyn_mod_ptr); + } + + return local_file_handle; +} + +/*****************************************************************************/ +/* DLOAD_load() */ +/* */ +/* Dynamically load the specified file and return a file handle for the */ +/* loaded file. If the load fails, this function will return a value of */ +/* zero (0) for the file handle. */ +/* */ +/* The core loader must have read access to the file pointed to by fd. 
*/ +/* */ +/*****************************************************************************/ +int32_t DLOAD_load(DLOAD_HANDLE handle, LOADER_FILE_DESC *fd) +{ + int32_t fl_handle; + + LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle; + DLIMP_Dynamic_Module *dyn_module = new_DLIMP_Dynamic_Module(fd); + + if (!dyn_module) + return 0; + +#if LOADER_DEBUG + /*------------------------------------------------------------------------*/ + /* Spit out some loader progress information when we begin loading an */ + /* object. */ + /*------------------------------------------------------------------------*/ + if (debugging_on) DLIF_trace("Loading file...\n"); +#endif + + /*------------------------------------------------------------------------*/ + /* If no access to a program was provided, there is nothing to do. */ + /*------------------------------------------------------------------------*/ + if (!fd) + { + DLIF_error(DLET_FILE, "Missing file specification.\n"); + delete_DLIMP_Dynamic_Module(handle, &dyn_module); + return 0; + } + + /*------------------------------------------------------------------------*/ + /* Read file headers and dynamic information into dynamic module. */ + /*------------------------------------------------------------------------*/ + if (!dload_headers(fd, dyn_module)) + { + delete_DLIMP_Dynamic_Module(handle, &dyn_module); + return 0; + } + + /*------------------------------------------------------------------------*/ + /* Find the dynamic segment, if there is one, and read dynamic */ + /* information from the ELF object file into the dynamic module data */ + /* structure associated with this file. */ + /*------------------------------------------------------------------------*/ + if (!dload_dynamic_segment(handle, fd, dyn_module)) + return 0; + + /*------------------------------------------------------------------------*/ + /* Perform sanity checking on the read-in ELF file. 
*/ + /*------------------------------------------------------------------------*/ + if (!is_valid_elf_object_file(fd, dyn_module)) + { + DLIF_error(DLET_FILE, "Attempt to load invalid ELF file, '%s'.\n", + dyn_module->name); + return 0; + } + +#if LOADER_DEBUG || LOADER_PROFILE + /*------------------------------------------------------------------------*/ + /* Stop clock on initialization of ELF file information. Start clock on */ + /* initialization of ELF module. */ + /*------------------------------------------------------------------------*/ + if (debugging_on || profiling_on) + { + DLIF_trace("Finished dload_dynamic_segment.\n"); + if (profiling_on) + { + profile_stop_clock(); + DLIF_trace("Took %lu cycles.\n", + (unsigned long) profile_cycle_count()); + } + } +#endif + + /*------------------------------------------------------------------------*/ + /* Initialize internal ELF module and segment structures. Sets */ + /* loaded_module in *dyn_module. This also deals with assigning a file */ + /* handle and bumping file handle counter. */ + /*------------------------------------------------------------------------*/ + initialize_loaded_module(handle, dyn_module); + + /*------------------------------------------------------------------------*/ + /* Append Module structure to loaded object list. */ + /*------------------------------------------------------------------------*/ + loaded_module_ptr_enqueue(&pHandle->DLIMP_loaded_objects, + dyn_module->loaded_module); + + /*------------------------------------------------------------------------*/ + /* Support static loading as special case. */ + /*------------------------------------------------------------------------*/ + if (!dyn_module->relocatable) + return dload_static_executable(handle, fd, dyn_module); + + /*------------------------------------------------------------------------*/ + /* Get space & address for segments, and offset symbols and program */ + /* header table to reflect the relocated address. 
Also offset the */ + /* addresses in the internal Segment structures used by the Module */ + /* structure. Note that this step needs to be performed prior and in */ + /* addition to the relocation entry processing. */ + /*------------------------------------------------------------------------*/ + if (!allocate_dynamic_segments_and_relocate_symbols(handle, fd, dyn_module)) + return 0; + + /*------------------------------------------------------------------------*/ + /* __c_args__ points to the beginning of the .args section, if there is */ + /* one. __TI_STATIC_BASE points to the beginning of the DP-relative data */ + /* segment (value to initialize DP). Record these addresses in the ELF */ + /* file internal data object. */ + /*------------------------------------------------------------------------*/ + DLSYM_lookup_local_symtab("__c_args__", dyn_module->symtab, + dyn_module->symnum, + (Elf32_Addr *)&dyn_module->c_args); + + DLSYM_lookup_local_symtab("__TI_STATIC_BASE", dyn_module->symtab, + dyn_module->symnum, + (Elf32_Addr *)&dyn_module->static_base); + dyn_module->loaded_module->static_base = dyn_module->static_base; + + /*------------------------------------------------------------------------*/ + /* If the user application performs initialization and termination, */ + /* the dynamic loader shouldn't process the init/fini sections. */ + /* Check and adjust the init/fini information accordingly. */ + /*------------------------------------------------------------------------*/ + adjust_module_init_fini(dyn_module); + + /*------------------------------------------------------------------------*/ + /* Execute any user defined pre-initialization functions that may be */ + /* associated with a dynamic executable module. 
*/ + /*------------------------------------------------------------------------*/ + if (dyn_module->fhdr.e_type == ET_EXEC) + store_preinit_data(dyn_module); + + /*------------------------------------------------------------------------*/ + /* Append current ELF file to list of objects currently loading. */ + /* This is used to detect circular dependencies while we are processing */ + /* the dependents of this file. */ + /*------------------------------------------------------------------------*/ + AL_append(&pHandle->DLIMP_module_dependency_list, &dyn_module->name); + + /*------------------------------------------------------------------------*/ + /* Push this dynamic module object onto the dependency stack. */ + /* All of the modules on the stack will get relocated after all of the */ + /* dependent files have been loaded and allocated. */ + /*------------------------------------------------------------------------*/ + dynamic_module_ptr_push(&pHandle->DLIMP_dependency_stack, dyn_module); + + /*------------------------------------------------------------------------*/ + /* If this object file uses the DSBT model, then register a DSBT index */ + /* request with the client's DSBT support management. */ + /*------------------------------------------------------------------------*/ + if (is_dsbt_module(dyn_module) && + !DLIF_register_dsbt_index_request(handle, + dyn_module->name, + dyn_module->loaded_module->file_handle, + dyn_module->dsbt_index)) + return 0; + + /*------------------------------------------------------------------------*/ + /* Load this ELF file's dependees (all files on its DT_NEEDED list). */ + /* Dependees must be loaded and relocated before processing this module's */ + /* relocations. 
*/ + /*------------------------------------------------------------------------*/ + if (!dload_and_allocate_dependencies(handle, dyn_module)) + return 0; + + /*------------------------------------------------------------------------*/ + /* Remove the current ELF file from the list of files that are in the */ + /* process of loading. */ + /*------------------------------------------------------------------------*/ + pHandle->DLIMP_module_dependency_list.size--; + + /*------------------------------------------------------------------------*/ + /* Process relocation entries. */ + /*------------------------------------------------------------------------*/ + fl_handle = relocate_dependency_graph_modules(handle, fd, dyn_module); + + /*------------------------------------------------------------------------*/ + /* With initialization complete, and all relocations having been resolved */ + /* do module initialization. */ + /*------------------------------------------------------------------------*/ + execute_module_initialization(handle); + + return fl_handle; +} + +/*****************************************************************************/ +/* DLOAD_get_entry_names() */ +/* */ +/* Build a list of entry point names for a loaded object. Currently, */ +/* any global symbol in the module is considered a valid entry point */ +/* regardless of whether it is defined in code or associated with a */ +/* data object. We would need to process the content of the symbol */ +/* table entry or its debug information to determine whether it is a */ +/* valid entry point or not. */ +/* */ +/*****************************************************************************/ +BOOL DLOAD_get_entry_names(DLOAD_HANDLE handle, + uint32_t file_handle, + int32_t *entry_pt_cnt, + char ***entry_pt_names) +{ + /*------------------------------------------------------------------------*/ + /* Spin through list of loaded files until we find the file handle we */ + /* are looking for. 
Then build a list of entry points from that file's */ + /* symbol table. */ + /*------------------------------------------------------------------------*/ + LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle; + + loaded_module_ptr_Queue_Node* ptr; + for (ptr = pHandle->DLIMP_loaded_objects.front_ptr; ptr != NULL; + ptr = ptr->next_ptr) + { + if (ptr->value->file_handle == file_handle) + { + DLIMP_Loaded_Module *module = ptr->value; + struct Elf32_Sym *symtab; + int i; + + /*------------------------------------------------------------------*/ + /* Any symbol in our file's symbol table is considered a valid */ + /* entry point. */ + /*------------------------------------------------------------------*/ + symtab = (struct Elf32_Sym*)module->gsymtab; + *entry_pt_cnt = module->gsymnum; + *entry_pt_names = DLIF_malloc(*entry_pt_cnt * sizeof(char*)); + for (i = 0; i < module->gsymnum; i++) + { + const char *sym_name = (const char *)symtab[i].st_name; + **entry_pt_names = DLIF_malloc(strlen(sym_name) + 1); + strcpy(**entry_pt_names,sym_name); + } + + return TRUE; + } + } + + /*------------------------------------------------------------------------*/ + /* We didn't find the file we were looking for, return false. */ + /*------------------------------------------------------------------------*/ + return FALSE; +} + +/*****************************************************************************/ +/* DLOAD_prepare_for_execution() */ +/* */ +/* Given a file handle, prepare for execution : */ +/* - Return entry point associated with that module in the *sym_val */ +/* output parameter. */ +/* - Write out the given arguments to the .args section contained in the */ +/* same module. */ +/* - As a test (for the Reference implementation) read the arguments */ +/* using the DLIF_read_arguments() function and set global argc,argv. 
*/ +/* */ +/*****************************************************************************/ +BOOL DLOAD_prepare_for_execution(DLOAD_HANDLE handle, uint32_t file_handle, + TARGET_ADDRESS *sym_val, + int argc, char** argv) +{ + /*------------------------------------------------------------------------*/ + /* Spin through list of loaded files until we find the file handle we */ + /* are looking for. Then return the entry point address associated with */ + /* that module. */ + /*------------------------------------------------------------------------*/ + DLIMP_Loaded_Module *ep_loaded_module; + loaded_module_ptr_Queue_Node* ptr; + LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle; + + for (ptr = pHandle->DLIMP_loaded_objects.front_ptr; ptr != NULL; + ptr = ptr->next_ptr) + if (ptr->value->file_handle == file_handle) + { + *sym_val = (TARGET_ADDRESS)(ptr->value->entry_point); + ep_loaded_module = ptr->value; + + /*------------------------------------------------------------------*/ + /* Write argc, argv to the .args section in this module. */ + /*------------------------------------------------------------------*/ + if (!write_arguments_to_args_section(handle, argc, argv, + ep_loaded_module)) + { + DLIF_error(DLET_MISC, "Couldn't write to .args section\n"); + return FALSE; + } + + /*------------------------------------------------------------------*/ + /* For the Reference Implementation we simulate a "boot" (rts boot */ + /* routine reads argc, argv from .args), by reading argc, argv from */ + /* .args section. Note that we just wrote these values to the .args */ + /* so this read serves as a test for the Reference Implementation. */ + /*------------------------------------------------------------------*/ + read_args_from_section(ep_loaded_module); + return TRUE; + } + + /*------------------------------------------------------------------------*/ + /* We didn't find the file we were looking for, return false. 
*/ + /*------------------------------------------------------------------------*/ + return FALSE; +} + +/*****************************************************************************/ +/* DLOAD_load_arguments() */ +/* */ +/* Write out the given arguments to the .args section contained in the */ +/* same module. */ +/* */ +/*****************************************************************************/ +BOOL DLOAD_load_arguments(DLOAD_HANDLE handle, uint32_t file_handle, + int argc, char** argv) +{ + /*------------------------------------------------------------------------*/ + /* Spin through list of loaded files until we find the file handle we */ + /* are looking for. Then return the entry point address associated with */ + /* that module. */ + /*------------------------------------------------------------------------*/ + DLIMP_Loaded_Module *ep_loaded_module; + loaded_module_ptr_Queue_Node* ptr; + LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle; + + for (ptr = pHandle->DLIMP_loaded_objects.front_ptr; ptr != NULL; + ptr = ptr->next_ptr) + if (ptr->value->file_handle == file_handle) + { + ep_loaded_module = ptr->value; + + /*------------------------------------------------------------------*/ + /* Write argc, argv to the .args section in this module. */ + /*------------------------------------------------------------------*/ + if (!write_arguments_to_args_section(handle, argc, argv, + ep_loaded_module)) + { + DLIF_error(DLET_MISC, "Couldn't write to .args section\n"); + return FALSE; + } + } + + /*------------------------------------------------------------------------*/ + /* We didn't find the file we were looking for, return false. 
*/ + /*------------------------------------------------------------------------*/ + return FALSE; +} + +/*****************************************************************************/ +/* DLOAD_get_entry_point() */ +/* */ +/* Given a file handle, return the entry point associated with that */ +/* module in the *sym_val output parameter. */ +/* */ +/*****************************************************************************/ +BOOL DLOAD_get_entry_point(DLOAD_HANDLE handle, uint32_t file_handle, + TARGET_ADDRESS *sym_val) +{ + /*------------------------------------------------------------------------*/ + /* Spin through list of loaded files until we find the file handle we */ + /* are looking for. Then return the entry point address associated with */ + /* that module. */ + /*------------------------------------------------------------------------*/ + loaded_module_ptr_Queue_Node* ptr; + LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle; + + for (ptr = pHandle->DLIMP_loaded_objects.front_ptr; ptr != NULL; + ptr = ptr->next_ptr) + if (ptr->value->file_handle == file_handle) + { + *sym_val = (TARGET_ADDRESS)(ptr->value->entry_point); + return TRUE; + } + + /*------------------------------------------------------------------------*/ + /* We didn't find the file we were looking for, return false. */ + /*------------------------------------------------------------------------*/ + return FALSE; +} + +/*****************************************************************************/ +/* DLOAD_query_symbol() */ +/* */ +/* Query the value of a global symbol from a specific file. The value */ +/* result will be written to *sym_val. The function returns TRUE if the */ +/* symbol was found, and FALSE if it wasn't. 
*/
/*                                                                           */
/*****************************************************************************/
BOOL DLOAD_query_symbol(DLOAD_HANDLE handle,
                        uint32_t file_handle,
                        const char *sym_name,
                        TARGET_ADDRESS *sym_val)
{
   LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle;
   loaded_module_ptr_Queue_Node *node;

   /* Walk the loaded-object list.  For each module whose file handle
    * matches, scan its global symbol table by name and hand back the
    * value (target address) of the first symbol that matches. */
   for (node = pHandle->DLIMP_loaded_objects.front_ptr; node != NULL;
        node = node->next_ptr)
   {
      DLIMP_Loaded_Module *module;
      struct Elf32_Sym    *symtab;
      int                  idx;

      if (node->value->file_handle != file_handle) continue;

      module = node->value;
      symtab = (struct Elf32_Sym*)module->gsymtab;

      /* Search through the symbol table by name. */
      idx = 0;
      while (idx < module->gsymnum)
      {
         if (strcmp(sym_name, (const char *)symtab[idx].st_name) == 0)
         {
            *sym_val = (TARGET_ADDRESS) symtab[idx].st_value;
            return TRUE;
         }
         idx++;
      }

      /* Not in this module: keep walking the list, as before. */
   }

   /* We didn't find the symbol we were looking for, return false. */
   return FALSE;
}



/*****************************************************************************/
/* unlink_loaded_module()                                                    */
/*                                                                           */
/*    Unlink a loaded module data object from the list of loaded objects,    */
/*    returning a pointer to the object so that it can be deconstructed.
*/ +/* */ +/*****************************************************************************/ +static DLIMP_Loaded_Module *unlink_loaded_module(DLOAD_HANDLE handle, + loaded_module_ptr_Queue_Node *back_ptr, + loaded_module_ptr_Queue_Node *lm_node) +{ + DLIMP_Loaded_Module *loaded_module = lm_node->value; + LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle; + loaded_module_ptr_remove(&pHandle->DLIMP_loaded_objects, lm_node->value); + return loaded_module; +} + +/*****************************************************************************/ +/* execute_module_termination() */ +/* */ +/* Execute termination functions associated with this loaded module. */ +/* Termination functions are called in the reverse order as their */ +/* corresponding initialization functions. */ +/* */ +/*****************************************************************************/ +static void execute_module_termination(DLOAD_HANDLE handle, + DLIMP_Loaded_Module *loaded_module) +{ + LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle; + + /*------------------------------------------------------------------------*/ + /* If a DT_FINI_ARRAY dynamic tag was encountered for this module, spin */ + /* through the array in reverse order, calling each function address */ + /* stored in the array. */ + /*------------------------------------------------------------------------*/ + if (loaded_module->fini_arraysz != 0) + { + /*---------------------------------------------------------------------*/ + /* Now make a loader-accessible copy of the .fini_array section. 
*/ + /*---------------------------------------------------------------------*/ + int32_t i; + int32_t num_fini_fcns = + loaded_module->fini_arraysz/sizeof(TARGET_ADDRESS); + TARGET_ADDRESS *fini_array_buf = (TARGET_ADDRESS *) + DLIF_malloc(loaded_module->fini_arraysz); + + DLIF_read(pHandle->client_handle, + fini_array_buf, 1, loaded_module->fini_arraysz, + (TARGET_ADDRESS)loaded_module->fini_array); + + /*---------------------------------------------------------------------*/ + /* Now spin through the array in reverse order, executing each */ + /* termination function whose address occupies an entry in the array. */ + /*---------------------------------------------------------------------*/ + for (i = num_fini_fcns - 1; i >= 0; i--) + DLIF_execute(pHandle->client_handle, + (TARGET_ADDRESS)(fini_array_buf[i])); + + DLIF_free(fini_array_buf); + } + + /*------------------------------------------------------------------------*/ + /* If a DT_FINI dynamic tag was encountered for this module, call the */ + /* function indicated by the tag's value to complete the termination */ + /* process for this module. */ + /*------------------------------------------------------------------------*/ + if (loaded_module->fini != (Elf32_Addr) NULL) + DLIF_execute(pHandle->client_handle, + (TARGET_ADDRESS)loaded_module->fini); +} + +/*****************************************************************************/ +/* remove_loaded_module() */ +/* */ +/* Find and unlink a loaded module data object from the list of loaded */ +/* objects, then call its destructor to free the host memory associated */ +/* with the loaded module and all of its loaded segments. 
*/
/*                                                                           */
/*****************************************************************************/
static void remove_loaded_module(DLOAD_HANDLE handle,
                                 loaded_module_ptr_Queue_Node *lm_node)
{
   LOADER_OBJECT                *pHandle = (LOADER_OBJECT *)handle;
   loaded_module_ptr_Queue_Node *prev    = NULL;
   DLIMP_Loaded_Module          *victim  = NULL;

   /* Locate the node that precedes lm_node.  The head of the queue has no
    * predecessor, so prev stays NULL in that case. */
   if (lm_node != pHandle->DLIMP_loaded_objects.front_ptr)
   {
      prev = pHandle->DLIMP_loaded_objects.front_ptr;
      while (prev->next_ptr != lm_node)
         prev = prev->next_ptr;
   }

   /* Take the module off the loaded-object list, then destroy it. */
   victim = unlink_loaded_module(handle, prev, lm_node);

   delete_DLIMP_Loaded_Module(handle, &victim);
}

/*****************************************************************************/
/* DLOAD_unload()                                                            */
/*                                                                           */
/*    Unload specified module (identified by its file handle) from target    */
/*    memory.  Free up any target memory that was allocated for the module's */
/*    segments and also any host heap memory that was allocated for the      */
/*    internal module and segment data structures.                           */
/*                                                                           */
/*    Return TRUE if program entry is actually destroyed.  This is a way of  */
/*    communicating to the client when it needs to actually remove debug     */
/*    information associated with this module (so that client does not have  */
/*    to maintain a use count that mirrors the program entry).
*/ +/* */ +/*****************************************************************************/ +BOOL DLOAD_unload(DLOAD_HANDLE handle, uint32_t file_handle) +{ + loaded_module_ptr_Queue_Node* lm_node; + LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle; + + for (lm_node = pHandle->DLIMP_loaded_objects.front_ptr; lm_node != NULL; + lm_node = lm_node->next_ptr) + { + if (lm_node->value->file_handle == file_handle) + { + --lm_node->value->use_count; + if (lm_node->value->use_count == 0) + { + DLIMP_Loaded_Module *loaded_module = + (DLIMP_Loaded_Module *)lm_node->value; + int j; + int *dep_file_handles; + + /*---------------------------------------------------------------*/ + /* Termination functions need to be executed in the reverse */ + /* order as the corresponding initialization functions, so */ + /* before we go unload this module's dependents, we need to */ + /* perform the user/global/static termination functions */ + /* associated with this module. */ + /*---------------------------------------------------------------*/ + execute_module_termination(handle, loaded_module); + + /*---------------------------------------------------------------*/ + /* Unload dependent modules via the client. Client needs to know */ + /* when a dependent gets unloaded so that it can update debug */ + /* information. */ + /*---------------------------------------------------------------*/ + dep_file_handles = (int*)(loaded_module->dependencies.buf); + for (j = 0; j < loaded_module->dependencies.size; j++) + DLIF_unload_dependent(pHandle->client_handle, + dep_file_handles[j]); + + /*---------------------------------------------------------------*/ + /* Find the predecessor node of the value we're deleting, */ + /* because its next_ptr will need to be updated. */ + /* */ + /* We can't keep a back pointer around because */ + /* DLIF_unload_dependent() might free that node, making our */ + /* pointer invalid. 
Turn the Queue template into a doubly */ + /* linked list if this overhead becomes a problem. */ + /*---------------------------------------------------------------*/ + remove_loaded_module(handle, lm_node); + + /*---------------------------------------------------------------*/ + /* Once unloading is done, reset virtual target to NULL. */ + /*---------------------------------------------------------------*/ + cur_target = NULL; + + return TRUE; + } + } + } + + return FALSE; +} + +/*****************************************************************************/ +/* DLOAD_load_symbols() */ +/* */ +/* Load the symbols from the given file and make symbols available for */ +/* global symbol linkage. */ +/* */ +/*****************************************************************************/ +int32_t DLOAD_load_symbols(DLOAD_HANDLE handle, LOADER_FILE_DESC *fd) +{ + DLIMP_Dynamic_Module *dyn_module = new_DLIMP_Dynamic_Module(fd); + DLIMP_Loaded_Module *loaded_module = NULL; + LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle; + + /*------------------------------------------------------------------------*/ + /* Ensure we have a valid dynamic module object from the constructor. */ + /*------------------------------------------------------------------------*/ + if (!dyn_module) + return 0; + + /*------------------------------------------------------------------------*/ + /* If no access to a program was provided, there is nothing to do. */ + /*------------------------------------------------------------------------*/ + if (!fd) + { + DLIF_error(DLET_FILE, "Missing file specification.\n"); + return 0; + } + + /*------------------------------------------------------------------------*/ + /* Record argc and argv pointers with the dynamic module record. 
*/ + /*------------------------------------------------------------------------*/ + dyn_module->argc = 0; + dyn_module->argv = NULL; + + /*------------------------------------------------------------------------*/ + /* Read file headers and dynamic information into dynamic module. */ + /*------------------------------------------------------------------------*/ + if (!dload_headers(fd, dyn_module)) + { + delete_DLIMP_Dynamic_Module(handle, &dyn_module); + return 0; + } + + /*------------------------------------------------------------------------*/ + /* Find the dynamic segment, if there is one, and read dynamic */ + /* information from the ELF object file into the dynamic module data */ + /* structure associated with this file. */ + /*------------------------------------------------------------------------*/ + if (!dload_dynamic_segment(handle, fd, dyn_module)) + { + delete_DLIMP_Dynamic_Module(handle, &dyn_module); + return 0; + } + + /*------------------------------------------------------------------------*/ + /* Perform sanity checking on the read-in ELF file. */ + /*------------------------------------------------------------------------*/ + if (!is_valid_elf_object_file(fd, dyn_module)) + { + DLIF_error(DLET_FILE, "Attempt to load invalid ELF file, '%s'.\n", + dyn_module->name); + delete_DLIMP_Dynamic_Module(handle, &dyn_module); + return 0; + } + + /*------------------------------------------------------------------------*/ + /* Initialize internal ELF module and segment structures. Sets */ + /* loaded_module in *dyn_module. This also deals with assigning a file */ + /* handle and bumping file handle counter. */ + /*------------------------------------------------------------------------*/ + initialize_loaded_module(handle, dyn_module); + + /*------------------------------------------------------------------------*/ + /* Add this module to the loaded module queue. */ + /* Detach the loaded module object from the dynamic module thath created */ + /* it. 
Ownership of the host memory allocated for the loaded module */ + /* object now belongs to the DLIMP_loaded_objects list. */ + /*------------------------------------------------------------------------*/ + loaded_module_ptr_enqueue(&pHandle->DLIMP_loaded_objects, + dyn_module->loaded_module); + + /*------------------------------------------------------------------------*/ + /* Register a DSBT index request for this module and update its own copy */ + /* of the DSBT with the contents of the client's master DSBT. */ + /*------------------------------------------------------------------------*/ + if (is_dsbt_module(dyn_module)) + { + dynamic_module_ptr_push(&pHandle->DLIMP_dependency_stack, dyn_module); + DLIF_register_dsbt_index_request(handle, + dyn_module->name, + dyn_module->loaded_module->file_handle, + dyn_module->dsbt_index); + DLIF_assign_dsbt_indices(); + DLIF_update_all_dsbts(); + dynamic_module_ptr_pop(&pHandle->DLIMP_dependency_stack); + } + + /*------------------------------------------------------------------------*/ + /* Ownership of the host memory allocated for the loaded module object is */ + /* transferred to the DLIMP_loaded_objects list. Free up the host memory */ + /* for the dynamic module that created the loaded module object. Just */ + /* call the destructor function for DLIMP_Dynamic_Module. */ + /*------------------------------------------------------------------------*/ + loaded_module = detach_loaded_module(dyn_module); + if(loaded_module == NULL) + { + delete_DLIMP_Dynamic_Module(handle, &dyn_module); + return 0; + } + delete_DLIMP_Dynamic_Module(handle, &dyn_module); + + /*------------------------------------------------------------------------*/ + /* Return a file handle so that the client can match this file to an ID. 
*/ + /*------------------------------------------------------------------------*/ + return loaded_module->file_handle; +} + +/*****************************************************************************/ +/* DSBT Support Functions */ +/*****************************************************************************/ + +/*****************************************************************************/ +/* DLOAD_get_dsbt_size() */ +/* */ +/* Find the amount of space allocated for the specified module's DSBT. */ +/* It must be big enough to hold a copy of the master DSBT or the client */ +/* will flag an error. Those modules whose DSBT size is zero are assumed */ +/* to not be using the DSBT model. */ +/* */ +/*****************************************************************************/ +uint32_t DLOAD_get_dsbt_size(DLOAD_HANDLE handle, int32_t file_handle) +{ + dynamic_module_ptr_Stack_Node *ptr; + LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle; + + for (ptr = pHandle->DLIMP_dependency_stack.top_ptr; ptr != NULL; + ptr = ptr->next_ptr) + { + DLIMP_Dynamic_Module *dmp = ptr->value; + if (dmp->loaded_module->file_handle == file_handle) + return dmp->dsbt_size; + } + + return 0; +} + +/*****************************************************************************/ +/* DLOAD_get_static_base() */ +/* */ +/* Look up static base symbol associated with the specified module. 
*/ +/* */ +/*****************************************************************************/ +BOOL DLOAD_get_static_base(DLOAD_HANDLE handle, int32_t file_handle, + TARGET_ADDRESS *static_base) +{ + loaded_module_ptr_Queue_Node* ptr; + LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle; + + for (ptr = pHandle->DLIMP_loaded_objects.front_ptr; ptr != NULL; + ptr = ptr->next_ptr) + { + DLIMP_Loaded_Module *lmp = ptr->value; + if (lmp->file_handle == file_handle) + { + *static_base = (TARGET_ADDRESS)lmp->static_base; + return TRUE; + } + } + + return FALSE; +} + +/*****************************************************************************/ +/* DLOAD_get_dsbt_base() */ +/* */ +/* Look up address of DSBT for the specified module. */ +/* */ +/*****************************************************************************/ +BOOL DLOAD_get_dsbt_base(DLOAD_HANDLE handle, int32_t file_handle, TARGET_ADDRESS *dsbt_base) +{ + dynamic_module_ptr_Stack_Node *ptr; + LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle; + + for (ptr = pHandle->DLIMP_dependency_stack.top_ptr; ptr != NULL; + ptr = ptr->next_ptr) + { + DLIMP_Dynamic_Module *dmp = ptr->value; + if (dmp->loaded_module->file_handle == file_handle) + { + *dsbt_base = + (TARGET_ADDRESS)dmp->dyntab[dmp->dsbt_base_tagidx].d_un.d_ptr; + return TRUE; + } + } + + return FALSE; +} + +/*****************************************************************************/ +/* RELOCATE() - Perform RELA and REL type relocations for given ELF object */ +/* file that we are in the process of loading and relocating. 
*/ +/*****************************************************************************/ +void DLREL_relocate(DLOAD_HANDLE handle, LOADER_FILE_DESC* elf_file, + DLIMP_Dynamic_Module* dyn_module) + +{ + cur_target->relocate(handle, elf_file, dyn_module); +} + +/*****************************************************************************/ +/* GET_VT_OBJ() - Once file headers have been read, use the e_machine id to */ +/* figure out the virtul target, so we can access trg specific funcs. */ +/*****************************************************************************/ +static VIRTUAL_TARGET *get_vt_obj(int given_id) +{ + VIRTUAL_TARGET *ptr; + + for(ptr = vt_arr; ptr->machine_id != EM_NONE ; ptr++) + if (ptr->machine_id == given_id) return ptr; + + return NULL; +} + +#if 0 && LOADER_DEBUG // enable to make available in debugger +/*****************************************************************************/ +/* DEBUG_QUEUE() - Debug function. */ +/*****************************************************************************/ +static void debug_queue(LOADER_OBJECT *pHandle, char* position) +{ + loaded_module_ptr_Queue_Node* ptr; + + if (!debugging_on) return; + + DLIF_trace ("\nDEBUG QUEUE : %s, pHandle : 0x%x\n\n", position, + (uint32_t)pHandle); + + for (ptr = pHandle->DLIMP_loaded_objects.front_ptr; ptr != NULL; + ptr = ptr->next_ptr) + { + DLIF_trace ("ptr->value->name : %s\n",ptr->value->name); + } + DLIF_trace ("\n"); +} +#endif + +/*****************************************************************************/ +/* READ_ARGS_FROM_SECTION() - This function reads the argc, argv from the */ +/* .args section, and is used to test Reference implementation. 
*/ +/*****************************************************************************/ +static void read_args_from_section(DLIMP_Loaded_Module* ep_module) +{ + /*------------------------------------------------------------------------*/ + /* Before this function in called, the loader has gotten argv/argc from */ + /* the module and written it out to the .args section. c_args points to */ + /* the .args section. */ + /*------------------------------------------------------------------------*/ + ARGS_CONTAINER *pargs = (ARGS_CONTAINER *)(ep_module->c_args); + if (!pargs || pargs == (ARGS_CONTAINER *)0xFFFFFFFF) + { + global_argc = 0; + global_argv = NULL; + } + else + { + global_argc = pargs->argc; + global_argv = pargs->argv; + } +} diff --git a/src/core/dsp/ocl_load/DLOAD/dload.h b/src/core/dsp/ocl_load/DLOAD/dload.h new file mode 100644 index 0000000..bb7d427 --- /dev/null +++ b/src/core/dsp/ocl_load/DLOAD/dload.h @@ -0,0 +1,334 @@ +/* +* dload.h +* +* Define internal data structures used by core dynamic loader. +* +* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/ +* +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* +* Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the +* distribution. +* +* Neither the name of Texas Instruments Incorporated nor the names of +* its contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. 
+* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ + +#ifndef DLOAD_H +#define DLOAD_H + +#include "ArrayList.h" +#include "Queue.h" +#include "Stack.h" +#include "elf32.h" +#include "dload_api.h" +#include "util.h" + +/*---------------------------------------------------------------------------*/ +/* Contains strings with names of files the loader is in process of loading. */ +/* This list is used to keep track of what objects are in the process of */ +/* loading while their dependents are being loaded so that we can detect */ +/* circular dependencies. */ +/*---------------------------------------------------------------------------*/ +extern Array_List DLIMP_module_dependency_list; + +/*---------------------------------------------------------------------------*/ +/* DLIMP_Loaded_Segment */ +/* */ +/* This structure represents a segment loaded on memory. */ +/* */ +/* This data structure should be created using host memory when a module */ +/* is being loaded into target memory. The data structure should persist */ +/* as long as the module stays resident in target memory. It should be */ +/* removed when the last use of the module is unloaded from the target. 
*/
+/*---------------------------------------------------------------------------*/
+typedef struct
+{
+   struct Elf32_Phdr phdr; /* ELF program header (presumably this */
+                           /* segment's entry - TODO confirm) */
+   Elf32_Addr input_vaddr; /* original segment load addr */
+   BOOL modified; /* NOTE(review): writer/reader of this */
+                  /* flag is not visible in this header */
+   struct DLOAD_MEMORY_SEGMENT *obj_desc;
+   void * host_address; /* presumably host-side address of the */
+                        /* segment contents - TODO confirm */
+} DLIMP_Loaded_Segment;
+
+/*---------------------------------------------------------------------------*/
+/* DLIMP_Loaded_Module */
+/* */
+/* This structure contains all the information the dynamic loader needs */
+/* to retain after loading an object file's segments into target memory. */
+/* The data structure is created while the object file is being loaded, */
+/* and should persist until the last use of the module is unloaded from */
+/* target memory. */
+/* */
+/* The information contained here is used by the dynamic loader to */
+/* perform dynamic symbol resolution, to track the use count, and to */
+/* finally deallocate the module's segments when the module is unloaded. */
+/*---------------------------------------------------------------------------*/
+typedef struct
+{
+   char *name; /* Local copy of so_name */
+   int32_t file_handle; /* File handle of this module */
+   int32_t use_count; /* Use count tracked by the loader */
+   Elf32_Addr entry_point; /* Entry point address into module */
+   struct Elf32_Sym *gsymtab; /* Module's global symbol table */
+   Elf32_Word gsymnum; /* # global symbols */
+   char *gstrtab; /* Module's global symbol names */
+   Elf32_Word gstrsz; /* Size of global string table */
+   Array_List loaded_segments; /* List of DLIMP_Loaded_Segment(s) */
+   Array_List dependencies; /* List of dependent file handles */
+   BOOL direct_dependent_only;
+
+   Elf32_Addr fini; /* .fini function/section address */
+   Elf32_Addr fini_array; /* .fini_array term fcn ary addr */
+   int32_t fini_arraysz; /* sizeof .fini_array */
+   uint8_t *c_args; /* address of module's .args sect */
+   uint8_t *static_base; /* address of module's STATIC_BASE */
+
+} DLIMP_Loaded_Module;
+
+/*---------------------------------------------------------------------------*/
+/* 
DLIMP_loaded_objects */ +/* */ +/* A list of loaded module objects (DLIMP_Loaded_Module *) that the */ +/* loader has placed into target memory. */ +/*---------------------------------------------------------------------------*/ +TYPE_QUEUE_DEFINITION(DLIMP_Loaded_Module*, loaded_module_ptr) +extern loaded_module_ptr_Queue DLIMP_loaded_objects; + +/*---------------------------------------------------------------------------*/ +/* DLIMP_Dynamic_Module */ +/* */ +/* This structure represents a dynamic module to be loaded by the dynamic */ +/* loader. It contains all the information necessary to load and relocate */ +/* the module. It actually contains most of the headers, dynamic info, */ +/* dynamic symbol table, string table etc. */ +/* */ +/* This structure is allocated in host memory while an ELF object file is */ +/* being loaded and will be destructed after the file has been */ +/* successfully loaded. To simplify loading and relocation of the object */ +/* file's segments, this data structure maintains a link to the loaded */ +/* module. This link is severed when the load is successfully completed. */ +/* The loaded module data structure will persist until the module is */ +/* actually unloaded from target memory, but this data structure will be */ +/* freed. */ +/* */ +/* If the load of the object file is not successful for any reason, then */ +/* the loaded module will not be detached from the dynamic module. In */ +/* such case, the destructor for the dynamic module will assume */ +/* responsibility for freeing any host memory associated with the loaded */ +/* module and its segments. 
*/
+/*---------------------------------------------------------------------------*/
+typedef struct
+{
+   char *name; /* Local copy of so_name */
+   LOADER_FILE_DESC *fd; /* Access to ELF object file */
+   struct Elf32_Ehdr fhdr; /* ELF Object File Header */
+   struct Elf32_Phdr *phdr; /* ELF Program Header Table */
+   Elf32_Word phnum; /* # entries in program header table */
+   char* strtab; /* String Table */
+   Elf32_Word strsz; /* String Table size in bytes */
+   struct Elf32_Dyn *dyntab; /* Elf Dynamic Table (.dynamic scn) */
+                             /* This contains a list of dynamic */
+                             /* tags which is terminated by a NULL */
+                             /* record. */
+   struct Elf32_Sym *symtab; /* Elf Dynamic Symbol Table */
+   Elf32_Word symnum; /* # symbols in dynamic symbol table */
+   Elf32_Word gsymtab_offset;/* Offset into symbol table where */
+                             /* global symbols start. */
+   Elf32_Word gstrtab_offset;/* Offset into string table where */
+                             /* global symbol names start. */
+
+   uint8_t *c_args; /* presumably address of module's */
+                    /* .args sect, as in */
+                    /* DLIMP_Loaded_Module - TODO confirm */
+   uint8_t *static_base; /* address of module's STATIC_BASE */
+   int32_t argc;
+   char **argv;
+   DLIMP_Loaded_Module *loaded_module; /* Link to the loaded module kept */
+                                       /* while the file is being loaded */
+   int32_t wrong_endian; /* NOTE(review): presumably nonzero */
+                         /* when file and host endianness */
+                         /* differ - confirm against users */
+   BOOL direct_dependent_only;
+   BOOL relocatable; /* TRUE if module can be relocated */
+                     /* at load-time. FALSE if module is */
+                     /* a static executable. */
+   BOOL relocate_entry_point; /* TRUE if the entry point has */
+                              /* not been relocated */
+
+   int32_t dsbt_index; /* DSBT index requested/assigned */
+   uint32_t dsbt_size; /* DSBT size for this module */
+   int32_t dsbt_base_tagidx;/* Location of DSBT base dyn tag */
+
+   int32_t preinit_array_idx; /* DT_PREINIT_ARRAY dyn tag loc */
+   int32_t preinit_arraysz; /* sizeof pre-init array */
+   int32_t init_idx; /* DT_INIT dynamic tag location */
+   int32_t init_array_idx; /* DT_INIT_ARRAY dyn tag location */
+   int32_t init_arraysz; /* sizeof init array */
+
+} DLIMP_Dynamic_Module;
+
+/*---------------------------------------------------------------------------*/
+/* DLIMP_dependency_stack */
+/* */
+/* A LIFO stack of dynamic module objects (DLIMP_Dynamic_Module *) that */
+/* is retained while dependent files are being loaded and allocated. It */
+/* is used to guide which dynamic modules need to be relocated after all */
+/* items in the dependency graph have been allocated. The stack is only */
+/* used when the client asks the core loader to load a dynamic executable */
+/* or library. When relocation is completed, this stack should be empty. */
+/*---------------------------------------------------------------------------*/
+TYPE_STACK_DEFINITION(DLIMP_Dynamic_Module*, dynamic_module_ptr)
+extern dynamic_module_ptr_Stack DLIMP_dependency_stack;
+
+/*---------------------------------------------------------------------------*/
+/* Private Loader Object instance. */
+/*---------------------------------------------------------------------------*/
+typedef struct
+{
+   /*-----------------------------------------------------------------------*/
+   /* Contains filenames (type const char*) the system is in the process of */
+   /* loading. Used to detect cycles in incorrectly compiled ELF binaries. */
+   /*-----------------------------------------------------------------------*/
+   Array_List DLIMP_module_dependency_list;
+
+   /*-----------------------------------------------------------------------*/
+   /* Contains objects (type DLIMP_Loaded_Module*) that the system has */
+   /* loaded into target memory. */
+   /*-----------------------------------------------------------------------*/
+   loaded_module_ptr_Queue DLIMP_loaded_objects;
+
+   /*-----------------------------------------------------------------------*/
+   /* Dependency Stack - LIFO stack of dynamic modules that are loaded */
+   /* when client asks to load a dynamic executable or library. Note that */
+   /* dependents that have already been loaded with another module will not */
+   /* appear on this stack. */
+   /*-----------------------------------------------------------------------*/
+   dynamic_module_ptr_Stack DLIMP_dependency_stack;
+
+   /*-----------------------------------------------------------------------*/
+   /* Counter for generating unique IDs for file handles. */
+   /* NOTE: File handle is assigned sequentially but is never reclaimed */
+   /* when the modules are unloaded. It is conceivable that a loader */
+   /* running for a long time and loading and unloading modules */
+   /* could wrap-around. The loader generates error in this case. */
+   /* Presumably each loader instance has a list of file handles, one for */
+   /* each file that it loads, and the file handle serves as an index into */
+   /* the list. Therefore even if the same file is loaded by two loader */
+   /* instances, both loader instances have a different file handle for the */
+   /* file - the file is mapped uniquely to its appropriate file handle per */
+   /* loader instance. */
+   /*-----------------------------------------------------------------------*/
+   int32_t file_handle;
+
+   /*-----------------------------------------------------------------------*/
+   /* Client token, passed in via DLOAD_create() */
+   /*-----------------------------------------------------------------------*/
+   void * client_handle;
+} LOADER_OBJECT;
+
+
+/*****************************************************************************/
+/* IF data : Below are the data structures used to store init-fini data. */
+/*****************************************************************************/
+typedef struct
+{
+   TARGET_ADDRESS sect_addr;
+   int32_t size;
+}
+IF_single_record;
+
+TYPE_QUEUE_DEFINITION(IF_single_record*, IF_table)
+extern IF_table_Queue TI_init_table;
+
+
+/*****************************************************************************/
+/* Container used to read in argc, argv from the .args section. */
+/* NOTE(review): argv[1] looks like a trailing-array idiom whose real */
+/* length is presumably argc - confirm against the code writing .args. */
+/*****************************************************************************/
+typedef struct { int argc; char *argv[1]; } ARGS_CONTAINER;
+
+
+/*****************************************************************************/
+/* is_dsbt_module() */
+/* */
+/* return true if the module uses DSBT model, i.e. its dsbt_size is not 0 */
+/*****************************************************************************/
+static inline BOOL is_dsbt_module(DLIMP_Dynamic_Module *dyn_module)
+{
+   return (dyn_module->dsbt_size != 0);
+}
+
+/*****************************************************************************/
+/* is_arm_module() */
+/* */
+/* return true if the module being processed is for ARM, i.e. the file */
+/* header's e_machine is EM_ARM */
+/*****************************************************************************/
+static inline BOOL is_arm_module(struct Elf32_Ehdr* fhdr)
+{
+   return fhdr->e_machine == EM_ARM;
+}
+
+/*****************************************************************************/
+/* is_c60_module() */
+/* */
+/* return true if the module being processed is for C60 */
+/*****************************************************************************/
+static inline BOOL is_c60_module(struct Elf32_Ehdr* fhdr)
+{
+   return fhdr->e_machine == EM_TI_C6000; /* match on the ELF machine id */
+}
+
+/*---------------------------------------------------------------------------*/
+/* DLIMP_update_dyntag_section_address() */
+/* */
+/* Given the index of a dynamic tag which we happen to know points to a */
+/* section address, find the program header table entry associated with */
+/* the specified address and update the tag value with the real address */
+/* of the section. */
+/* */
+/*---------------------------------------------------------------------------*/
+extern BOOL DLIMP_update_dyntag_section_address(DLIMP_Dynamic_Module *dyn_module,
+                                                int32_t i);
+
+extern uint32_t DLIMP_get_first_dyntag(int tag, struct Elf32_Dyn* dyn_table);
+
+/*---------------------------------------------------------------------------*/
+/* Global flags to help manage internal debug and profiling efforts. */
+/* */
+/* NOTE(review): the compiler-based default chosen by the first #ifndef */
+/* block is discarded right away: LOADER_DEBUG is #undef'd and then */
+/* redefined to 1, so debug (and profiling) support is always compiled in. */
+/* Confirm whether this override is meant to be permanent. */
+/*---------------------------------------------------------------------------*/
+#ifndef __TI_COMPILER_VERSION__
+#define LOADER_DEBUG 1
+#else
+#define LOADER_DEBUG 0
+#endif
+
+#undef LOADER_DEBUG
+
+#define LOADER_DEBUG 1
+#define LOADER_PROFILE 1
+
+#if LOADER_DEBUG
+extern BOOL debugging_on;
+#endif
+
+#if LOADER_DEBUG || LOADER_PROFILE
+extern BOOL profiling_on;
+#endif
+
+#endif
diff --git a/src/core/dsp/ocl_load/DLOAD/dload_endian.c b/src/core/dsp/ocl_load/DLOAD/dload_endian.c
new file mode 100644
index 0000000..ac6413b
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/dload_endian.c
@@ -0,0 +1,151 @@
+/*
+* dload_endian.c
+*
+* Simple helper functions to assist core loader with endian-ness issues
+* when the host endian-ness may be opposite the endian-ness of the target.
+* +* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/ +* +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* +* Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the +* distribution. +* +* Neither the name of Texas Instruments Incorporated nor the names of +* its contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ + +#include "dload_endian.h" + +/*****************************************************************************/ +/* DLIMP_GET_ENDIAN() - Determine endianness of the host. Uses ELF */ +/* endianness constants. 
*/
+/*****************************************************************************/
+int DLIMP_get_endian(void)
+{
+   /*------------------------------------------------------------------------*/
+   /* Inspect the lowest-addressed byte of a known 32-bit value. The byte */
+   /* is read through an unsigned char pointer, which may alias any object, */
+   /* instead of viewing the int32_t through an int16_t lvalue. */
+   /*------------------------------------------------------------------------*/
+   const int32_t probe = 0x1;
+
+   if (*((const unsigned char*)(&probe))) return ELFDATA2LSB;
+
+   return ELFDATA2MSB;
+}
+
+/*****************************************************************************/
+/* DLIMP_CHANGE_ENDIAN32() - Swap endianness of a 32-bit integer. */
+/* The value is swapped in place. The byte shuffling is done on an */
+/* unsigned copy so that no shift can overflow a signed int. */
+/*****************************************************************************/
+void DLIMP_change_endian32(int32_t* to_change)
+{
+   uint32_t bits = (uint32_t)(*to_change);
+
+   bits = ((bits & 0x000000FFu) << 24) |
+          ((bits & 0x0000FF00u) << 8) |
+          ((bits & 0x00FF0000u) >> 8) |
+          ((bits & 0xFF000000u) >> 24);
+
+   *to_change = (int32_t)bits;
+}
+
+/*****************************************************************************/
+/* DLIMP_CHANGE_ENDIAN16() - Swap endianness of a 16-bit integer. */
+/* The value is swapped in place, using an unsigned copy like the 32-bit */
+/* variant above. */
+/*****************************************************************************/
+void DLIMP_change_endian16(int16_t* to_change)
+{
+   uint16_t bits = (uint16_t)(*to_change);
+
+   bits = (uint16_t)(((bits & 0x00FFu) << 8) | ((bits & 0xFF00u) >> 8));
+
+   *to_change = (int16_t)bits;
+}
+
+/*****************************************************************************/
+/* DLIMP_CHANGE_EHDR_ENDIAN() - Swap endianness of an ELF file header. 
*/
+/*****************************************************************************/
+void DLIMP_change_ehdr_endian(struct Elf32_Ehdr* ehdr)
+{
+   /* Each field listed here is byte-swapped in place, one call per field. */
+   DLIMP_change_endian16((int16_t*)&ehdr->e_type);
+   DLIMP_change_endian16((int16_t*)&ehdr->e_machine);
+   DLIMP_change_endian32((int32_t*)&ehdr->e_version);
+   DLIMP_change_endian32((int32_t*)&ehdr->e_entry);
+   DLIMP_change_endian32((int32_t*)&ehdr->e_phoff);
+   DLIMP_change_endian32((int32_t*)&ehdr->e_shoff);
+   DLIMP_change_endian32((int32_t*)&ehdr->e_flags);
+   DLIMP_change_endian16((int16_t*)&ehdr->e_ehsize);
+   DLIMP_change_endian16((int16_t*)&ehdr->e_phentsize);
+   DLIMP_change_endian16((int16_t*)&ehdr->e_phnum);
+   DLIMP_change_endian16((int16_t*)&ehdr->e_shentsize);
+   DLIMP_change_endian16((int16_t*)&ehdr->e_shnum);
+   DLIMP_change_endian16((int16_t*)&ehdr->e_shstrndx);
+}
+
+/*****************************************************************************/
+/* DLIMP_CHANGE_PHDR_ENDIAN() - Swap endianness of an ELF program header. */
+/* All eight fields handled below are swapped in place as 32-bit words. */
+/*****************************************************************************/
+void DLIMP_change_phdr_endian(struct Elf32_Phdr* phdr)
+{
+   DLIMP_change_endian32((int32_t*)&phdr->p_type);
+   DLIMP_change_endian32((int32_t*)&phdr->p_offset);
+   DLIMP_change_endian32((int32_t*)&phdr->p_vaddr);
+   DLIMP_change_endian32((int32_t*)&phdr->p_paddr);
+   DLIMP_change_endian32((int32_t*)&phdr->p_filesz);
+   DLIMP_change_endian32((int32_t*)&phdr->p_memsz);
+   DLIMP_change_endian32((int32_t*)&phdr->p_flags);
+   DLIMP_change_endian32((int32_t*)&phdr->p_align);
+}
+
+/*****************************************************************************/
+/* DLIMP_CHANGE_DYNENT_ENDIAN() - Swap endianness of a dynamic table entry. 
*/ +/*****************************************************************************/ +void DLIMP_change_dynent_endian(struct Elf32_Dyn* dyn) +{ + DLIMP_change_endian32((int32_t*)(&dyn->d_tag)); + DLIMP_change_endian32((int32_t*)(&dyn->d_un.d_val)); +} + +/*****************************************************************************/ +/* DLIMP_CHANGE_SYM_ENDIAN() - Swap endianness of an ELF symbol table entry. */ +/*****************************************************************************/ +void DLIMP_change_sym_endian(struct Elf32_Sym* sym) +{ + DLIMP_change_endian32((int32_t*)(&sym->st_name)); + DLIMP_change_endian32((int32_t*)(&sym->st_value)); + DLIMP_change_endian32((int32_t*)(&sym->st_size)); + DLIMP_change_endian16((int16_t*)(&sym->st_shndx)); +} + +/*****************************************************************************/ +/* DLIMP_CHANGE_RELA_ENDIAN() - Swap endianness of a RELA-type relocation. */ +/*****************************************************************************/ +void DLIMP_change_rela_endian(struct Elf32_Rela* ra) +{ + DLIMP_change_endian32((int32_t*)(&ra->r_offset)); + DLIMP_change_endian32((int32_t*)(&ra->r_info)); + DLIMP_change_endian32((int32_t*)(&ra->r_addend)); +} + +/*****************************************************************************/ +/* DLIMP_CHANGE_REL_ENDIAN() - Swap endianness of a REL-type relocation. */ +/*****************************************************************************/ +void DLIMP_change_rel_endian(struct Elf32_Rel* r) +{ + DLIMP_change_endian32((int32_t*)(&r->r_offset)); + DLIMP_change_endian32((int32_t*)(&r->r_info)); +} diff --git a/src/core/dsp/ocl_load/DLOAD/dload_endian.h b/src/core/dsp/ocl_load/DLOAD/dload_endian.h new file mode 100644 index 0000000..ee74e11 --- /dev/null +++ b/src/core/dsp/ocl_load/DLOAD/dload_endian.h @@ -0,0 +1,58 @@ +/* +* dload_endian.h +* +* Specification of functions used to assist loader with endian-ness issues. 
+* +* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/ +* +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* +* Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the +* distribution. +* +* Neither the name of Texas Instruments Incorporated nor the names of +* its contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ + +#ifndef DLOAD_ENDIAN_H +#define DLOAD_ENDIAN_H + +#include "elf32.h" + +/*---------------------------------------------------------------------------*/ +/* Prototypes for ELF file object reader endianness swap routines. 
*/ +/*---------------------------------------------------------------------------*/ + +int DLIMP_get_endian(void); +void DLIMP_change_endian32(int32_t* to_change); +void DLIMP_change_endian16(int16_t* to_change); +void DLIMP_change_ehdr_endian(struct Elf32_Ehdr* to_change); +void DLIMP_change_phdr_endian(struct Elf32_Phdr* to_change); +void DLIMP_change_dynent_endian(struct Elf32_Dyn* to_change); +void DLIMP_change_sym_endian(struct Elf32_Sym* to_change); +void DLIMP_change_rela_endian(struct Elf32_Rela* to_change); +void DLIMP_change_rel_endian(struct Elf32_Rel* to_change); + +#endif diff --git a/src/core/dsp/ocl_load/DLOAD/elf32.c b/src/core/dsp/ocl_load/DLOAD/elf32.c new file mode 100644 index 0000000..082ba01 --- /dev/null +++ b/src/core/dsp/ocl_load/DLOAD/elf32.c @@ -0,0 +1,652 @@ +/* +* elf32.c +* +* Basic Data Structures for 32-Bit ELF Object Format Files +* +* The data structures in this file come primarily from this specification: +* +* Tool Interface Standard (TIS) +* Executable and Linking Format (ELF) Specification +* Version 1.2 +* +* TIS Committee +* May 1995 +* +* Additions and enhancements from this specification are also included: +* +* System V Application Binary Interface +* DRAFT 17 +* December 2003 +* +* http://sco.com/developers/gabi/2003-12-17/contents.html +* +* This is a C implementation of the data base objects that are commonly +* used in the source for TI development tools that support ELF. +* +* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/ +* +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. 
+* +* Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the +* distribution. +* +* Neither the name of Texas Instruments Incorporated nor the names of +* its contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*
+*/
+
+#include "elf32.h"
+
+/*---------------------------------------------------------------------------*/
+/* Dynamic Tag Database */
+/* */
+/* One entry per dynamic tag. Each entry gives the tag's printable name, */
+/* its DT_* value, an EDYN_UNTYPE_* classification (d_untype) and two */
+/* EDYN_TAGREQ_* requirement levels, d_exec_req and d_shared_req */
+/* (presumably for executables and shared objects respectively - confirm */
+/* against the struct EDYN_TAG declaration). The list is closed by a */
+/* sentinel entry whose d_tag_value is -1. */
+/*---------------------------------------------------------------------------*/
+
+const struct EDYN_TAG EDYN_TAG_DB[] =
+{
+   /* EDYN_TAG_NULL */
+   {
+      /* d_tag_name */ "DT_NULL",
+      /* d_tag_value */ DT_NULL,
+      /* d_untype */ EDYN_UNTYPE_IGNORED,
+      /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+      /* d_shared_req */ EDYN_TAGREQ_MANDATORY
+   },
+
+   /* EDYN_TAG_NEEDED */
+   {
+      /* d_tag_name */ "DT_NEEDED",
+      /* d_tag_value */ DT_NEEDED,
+      /* d_untype */ EDYN_UNTYPE_VAL,
+      /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+      /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+   },
+
+   /* EDYN_TAG_PLTRELSZ */
+   {
+      /* d_tag_name */ "DT_PLTRELSZ",
+      /* d_tag_value */ DT_PLTRELSZ,
+      /* d_untype */ EDYN_UNTYPE_VAL,
+      /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+      /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+   },
+
+   /* EDYN_TAG_PLTGOT */
+   {
+      /* d_tag_name */ "DT_PLTGOT",
+      /* d_tag_value */ DT_PLTGOT,
+      /* d_untype */ EDYN_UNTYPE_PTR,
+      /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+      /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+   },
+
+   /* EDYN_TAG_HASH */
+   {
+      /* d_tag_name */ "DT_HASH",
+      /* d_tag_value */ DT_HASH,
+      /* d_untype */ EDYN_UNTYPE_PTR,
+      /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+      /* d_shared_req */ EDYN_TAGREQ_MANDATORY
+   },
+
+   /* EDYN_TAG_STRTAB */
+   {
+      /* d_tag_name */ "DT_STRTAB",
+      /* d_tag_value */ DT_STRTAB,
+      /* d_untype */ EDYN_UNTYPE_PTR,
+      /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+      /* d_shared_req */ EDYN_TAGREQ_MANDATORY
+   },
+
+   /* EDYN_TAG_SYMTAB */
+   {
+      /* d_tag_name */ "DT_SYMTAB",
+      /* d_tag_value */ DT_SYMTAB,
+      /* d_untype */ EDYN_UNTYPE_PTR,
+      /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+      /* d_shared_req */ EDYN_TAGREQ_MANDATORY
+   },
+
+   /* EDYN_TAG_RELA */
+   {
+      /* d_tag_name */ "DT_RELA",
+      /* d_tag_value */ DT_RELA,
+      /* d_untype */ EDYN_UNTYPE_PTR,
+      /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+      /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+   },
+
+   /* EDYN_TAG_RELASZ */
+   {
+      /* d_tag_name */ "DT_RELASZ",
+      /* d_tag_value */ DT_RELASZ,
+      /* d_untype */ EDYN_UNTYPE_VAL,
+      /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+      /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+   },
+
+   /* EDYN_TAG_RELAENT */
+   {
+      /* d_tag_name */ "DT_RELAENT",
+      /* d_tag_value */ DT_RELAENT,
+      /* d_untype */ EDYN_UNTYPE_VAL,
+      /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+      /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+   },
+
+   /* EDYN_TAG_STRSZ */
+   {
+      /* d_tag_name */ "DT_STRSZ",
+      /* d_tag_value */ DT_STRSZ,
+      /* d_untype */ EDYN_UNTYPE_VAL,
+      /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+      /* d_shared_req */ EDYN_TAGREQ_MANDATORY
+   },
+
+   /* EDYN_TAG_SYMENT */
+   {
+      /* d_tag_name */ "DT_SYMENT",
+      /* d_tag_value */ DT_SYMENT,
+      /* d_untype */ EDYN_UNTYPE_VAL,
+      /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+      /* d_shared_req */ EDYN_TAGREQ_MANDATORY
+   },
+
+   /* EDYN_TAG_INIT */
+   {
+      /* d_tag_name */ "DT_INIT",
+      /* d_tag_value */ DT_INIT,
+      /* d_untype */ EDYN_UNTYPE_PTR,
+      /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+      /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+   },
+
+   /* EDYN_TAG_FINI */
+   {
+      /* d_tag_name */ "DT_FINI",
+      /* d_tag_value */ DT_FINI,
+      /* d_untype */ EDYN_UNTYPE_PTR,
+      /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+      /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+   },
+
+   /* EDYN_TAG_SONAME */
+   {
+      /* d_tag_name */ "DT_SONAME",
+      /* d_tag_value */ DT_SONAME,
+      /* d_untype */ EDYN_UNTYPE_VAL,
+      /* d_exec_req */ EDYN_TAGREQ_IGNORED,
+      /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+   },
+
+   /* EDYN_TAG_RPATH */
+   {
+      /* d_tag_name */ "DT_RPATH",
+      /* d_tag_value */ DT_RPATH,
+      /* d_untype */ EDYN_UNTYPE_VAL,
+      /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+      /* d_shared_req */ EDYN_TAGREQ_IGNORED
+   },
+
+   /* EDYN_TAG_SYMBOLIC */
+   {
+      /* d_tag_name */ "DT_SYMBOLIC",
+      /* d_tag_value */ DT_SYMBOLIC,
+      /* d_untype */ EDYN_UNTYPE_IGNORED,
+      /* d_exec_req */ EDYN_TAGREQ_IGNORED,
+      /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+   },
+
+   /* EDYN_TAG_REL */
+   {
+      /* d_tag_name */ "DT_REL",
+      /* d_tag_value */ DT_REL,
+      /* d_untype */ EDYN_UNTYPE_PTR,
+      /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+      /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+   },
+
+   /* EDYN_TAG_RELSZ */
+   {
+      /* d_tag_name */ "DT_RELSZ",
+      /* d_tag_value */ DT_RELSZ,
+      /* d_untype */ EDYN_UNTYPE_VAL,
+      /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+      /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+   },
+
+   /* EDYN_TAG_RELENT */
+   {
+      /* d_tag_name */ "DT_RELENT",
+      /* d_tag_value */ DT_RELENT,
+      /* d_untype */ EDYN_UNTYPE_VAL,
+      /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+      /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+   },
+
+   /* EDYN_TAG_PLTREL */
+   {
+      /* d_tag_name */ "DT_PLTREL",
+      /* d_tag_value */ DT_PLTREL,
+      /* d_untype */ EDYN_UNTYPE_VAL,
+      /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+      /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+   },
+
+   /* EDYN_TAG_DEBUG */
+   {
+      /* d_tag_name */ "DT_DEBUG",
+      /* d_tag_value */ DT_DEBUG,
+      /* d_untype */ EDYN_UNTYPE_PTR,
+      /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+      /* d_shared_req */ EDYN_TAGREQ_IGNORED
+   },
+
+   /* EDYN_TAG_TEXTREL */
+   {
+      /* d_tag_name */ "DT_TEXTREL",
+      /* d_tag_value */ DT_TEXTREL,
+      /* d_untype */ EDYN_UNTYPE_IGNORED,
+      /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+      /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+   },
+
+   /* EDYN_TAG_JMPREL */
+   {
+      /* d_tag_name */ "DT_JMPREL",
+      /* d_tag_value */ DT_JMPREL,
+      /* d_untype */ EDYN_UNTYPE_PTR,
+      /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+      /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+   },
+
+   /* EDYN_TAG_BIND_NOW */
+   {
+      /* d_tag_name */ "DT_BIND_NOW",
+      /* d_tag_value */ DT_BIND_NOW,
+      /* d_untype */ EDYN_UNTYPE_IGNORED,
+      /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+      /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+   },
+
+   /* EDYN_TAG_INIT_ARRAY */
+   {
+      /* d_tag_name */ "DT_INIT_ARRAY",
+      /* d_tag_value */ DT_INIT_ARRAY,
+      /* d_untype */ EDYN_UNTYPE_PTR,
+      /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+      /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+   },
+
+   /* EDYN_TAG_FINI_ARRAY */
+   {
+      /* d_tag_name */ "DT_FINI_ARRAY",
+      /* d_tag_value */ DT_FINI_ARRAY,
+      /* d_untype */ EDYN_UNTYPE_PTR,
+      /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+      /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+   },
+
+   /* EDYN_TAG_INIT_ARRAYSZ */
+   {
+      /* d_tag_name */ "DT_INIT_ARRAYSZ",
+      /* d_tag_value */ DT_INIT_ARRAYSZ,
+      /* d_untype */ EDYN_UNTYPE_VAL,
+      /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+      /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+   },
+
+   /* EDYN_TAG_FINI_ARRAYSZ */
+   {
+      /* d_tag_name */ "DT_FINI_ARRAYSZ",
+      /* d_tag_value */ DT_FINI_ARRAYSZ,
+      /* d_untype */ EDYN_UNTYPE_VAL,
+      /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+      /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+   },
+
+   /* EDYN_TAG_RUNPATH */
+   {
+      /* d_tag_name */ "DT_RUNPATH",
+      /* d_tag_value */ DT_RUNPATH,
+      /* d_untype */ EDYN_UNTYPE_VAL,
+      /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+      /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+   },
+
+   /* EDYN_TAG_FLAGS */
+   {
+      /* d_tag_name */ "DT_FLAGS",
+      /* d_tag_value */ DT_FLAGS,
+      /* d_untype */ EDYN_UNTYPE_VAL,
+      /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+      /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+   },
+
+   /* EDYN_TAG_ENCODING */
+   {
+      /* d_tag_name */ "DT_ENCODING",
+      /* d_tag_value */ DT_ENCODING,
+      /* d_untype */ EDYN_UNTYPE_UNSPECIFIED,
+      /* d_exec_req */ EDYN_TAGREQ_UNSPECIFIED,
+      /* d_shared_req */ EDYN_TAGREQ_UNSPECIFIED
+   },
+
+   /* EDYN_TAG_PREINIT_ARRAY */
+   {
+      /* d_tag_name */ "DT_PREINIT_ARRAY",
+      /* d_tag_value */ DT_PREINIT_ARRAY,
+      /* d_untype */ EDYN_UNTYPE_PTR,
+      /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+      /* d_shared_req */ EDYN_TAGREQ_IGNORED
+   },
+
+   /* EDYN_TAG_PREINIT_ARRAYSZ */
+   {
+      /* d_tag_name */ "DT_PREINIT_ARRAYSZ",
+      /* d_tag_value */ DT_PREINIT_ARRAYSZ,
+      /* d_untype */ EDYN_UNTYPE_VAL,
+      /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+      /* d_shared_req */ EDYN_TAGREQ_IGNORED
+   },
+
+   /* Terminate array with an id of -1 */
+   {
+      /* d_tag_name */ "",
+      /* d_tag_value */ -1,
+      /* d_untype */ EDYN_UNTYPE_UNSPECIFIED,
+      /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+      /* d_shared_req */ EDYN_TAGREQ_IGNORED
+   }
+};
+
+/*---------------------------------------------------------------------------*/ +/* Special Section Database */ +/*---------------------------------------------------------------------------*/ +const struct ESCN ESCN_DB[] = +{ + /* .bss */ + { + /* name */ ESCN_BSS_name, + /* sh_type */ SHT_NOBITS, + /* sh_entsize */ 0, + /* sh_flags */ SHF_ALLOC + SHF_WRITE + }, + + /* .comment */ + { + /* name */ ESCN_COMMENT_name, + /* sh_type */ SHT_PROGBITS, + /* sh_entsize */ 0, + /* sh_flags */ 0 + }, + + /* .data */ + { + /* name */ ESCN_DATA_name, + /* sh_type */ SHT_PROGBITS, + /* sh_entsize */ 0, + /* sh_flags */ SHF_ALLOC + SHF_WRITE + }, + + /* .data1 */ + { + /* name */ ESCN_DATA1_name, + /* sh_type */ SHT_PROGBITS, + /* sh_entsize */ 0, + /* sh_flags */ SHF_ALLOC + SHF_WRITE + }, + + /* .debug */ + { + /* name */ ESCN_DEBUG_name, + /* sh_type */ SHT_PROGBITS, + /* sh_entsize */ 0, + /* sh_flags */ 0 + }, + + /* .dynamic */ + { + /* name */ ESCN_DYNAMIC_name, + /* sh_type */ SHT_DYNAMIC, + /* sh_entsize */ sizeof(struct Elf32_Dyn), + /* sh_flags */ SHF_ALLOC + }, + + /* .dynstr */ + { + /* name */ ESCN_DYNSTR_name, + /* sh_type */ SHT_STRTAB, + /* sh_entsize */ sizeof(char), + /* sh_flags */ SHF_ALLOC + SHF_STRINGS + }, + + /* .dynsym */ + { + /* name */ ESCN_DYNSYM_name, + /* sh_type */ SHT_DYNSYM, + /* sh_entsize */ sizeof(struct Elf32_Sym), + /* sh_flags */ SHF_ALLOC + }, + + /* .fini */ + { + /* name */ ESCN_FINI_name, + /* sh_type */ SHT_PROGBITS, + /* sh_entsize */ 0, + /* sh_flags */ SHF_ALLOC + SHF_EXECINSTR + }, + + /* .fini_array */ + { + /* name */ ESCN_FINI_ARRAY_name, + /* sh_type */ SHT_FINI_ARRAY, + /* sh_entsize */ 0, + /* sh_flags */ SHF_ALLOC + SHF_WRITE + }, + + /* .got */ + { + /* name */ ESCN_GOT_name, + /* sh_type */ SHT_PROGBITS, + /* sh_entsize */ 0, + /* sh_flags */ 0 + }, + + /* .hash */ + { + /* name */ ESCN_HASH_name, + /* sh_type */ SHT_HASH, + /* sh_entsize */ 0, + /* sh_flags */ SHF_ALLOC + }, + + /* .init */ + { + /* name */ 
ESCN_INIT_name, + /* sh_type */ SHT_PROGBITS, + /* sh_entsize */ 0, + /* sh_flags */ SHF_ALLOC + SHF_EXECINSTR + }, + + /* .init_array */ + { + /* name */ ESCN_INIT_ARRAY_name, + /* sh_type */ SHT_INIT_ARRAY, + /* sh_entsize */ 0, + /* sh_flags */ SHF_ALLOC + SHF_WRITE + }, + + /* .interp */ + { + /* name */ ESCN_INTERP_name, + /* sh_type */ SHT_PROGBITS, + /* sh_entsize */ 0, + /* sh_flags */ 0 + }, + + /* .line */ + { + /* name */ ESCN_LINE_name, + /* sh_type */ SHT_PROGBITS, + /* sh_entsize */ 0, + /* sh_flags */ 0 + }, + + /* .note */ + { + /* name */ ESCN_NOTE_name, + /* sh_type */ SHT_NOTE, + /* sh_entsize */ 0, + /* sh_flags */ 0 + }, + + /* .plt */ + { + /* name */ ESCN_PLT_name, + /* sh_type */ SHT_PROGBITS, + /* sh_entsize */ 0, + /* sh_flags */ 0 + }, + + /* .preinit_array */ + { + /* name */ ESCN_PREINIT_ARRAY_name, + /* sh_type */ SHT_PREINIT_ARRAY, + /* sh_entsize */ 0, + /* sh_flags */ SHF_ALLOC + SHF_WRITE + }, + + /* .rel */ + { + /* name */ ESCN_REL_name, + /* sh_type */ SHT_REL, + /* sh_entsize */ sizeof(struct Elf32_Rel), + /* sh_flags */ 0 + }, + + /* .rela */ + { + /* name */ ESCN_RELA_name, + /* sh_type */ SHT_RELA, + /* sh_entsize */ sizeof(struct Elf32_Rela), + /* sh_flags */ 0 + }, + + /* .rodata */ + { + /* name */ ESCN_RODATA_name, + /* sh_type */ SHT_PROGBITS, + /* sh_entsize */ 0, + /* sh_flags */ SHF_ALLOC + }, + + /* .rodata1 */ + { + /* name */ ESCN_RODATA1_name, + /* sh_type */ SHT_PROGBITS, + /* sh_entsize */ 0, + /* sh_flags */ SHF_ALLOC + }, + + /* .shstrtab */ + { + /* name */ ESCN_SHSTRTAB_name, + /* sh_type */ SHT_STRTAB, + /* sh_entsize */ sizeof(char), + /* sh_flags */ SHF_STRINGS + }, + + /* .strtab */ + { + /* name */ ESCN_STRTAB_name, + /* sh_type */ SHT_STRTAB, + /* sh_entsize */ sizeof(char), + /* sh_flags */ SHF_STRINGS + }, + + /* .symtab */ + { + /* name */ ESCN_SYMTAB_name, + /* sh_type */ SHT_SYMTAB, + /* sh_entsize */ sizeof(struct Elf32_Sym), + /* sh_flags */ 0 + }, + + /* .symtab_shndx */ + { + /* name */ 
ESCN_SYMTAB_SHNDX_name, + /* sh_type */ SHT_SYMTAB_SHNDX, + /* sh_entsize */ sizeof(Elf32_Word), + /* sh_flags */ 0 + }, + + /* .tbss */ + { + /* name */ ESCN_TBSS_name, + /* sh_type */ SHT_NOBITS, + /* sh_entsize */ 0, + /* sh_flags */ SHF_ALLOC + SHF_WRITE + SHF_TLS + }, + + /* .tdata */ + { + /* name */ ESCN_TDATA_name, + /* sh_type */ SHT_PROGBITS, + /* sh_entsize */ 0, + /* sh_flags */ SHF_ALLOC + SHF_WRITE + SHF_TLS + }, + + /* .tdata1 */ + { + /* name */ ESCN_TDATA1_name, + /* sh_type */ SHT_PROGBITS, + /* sh_entsize */ 0, + /* sh_flags */ SHF_ALLOC + SHF_WRITE + SHF_TLS + }, + + /* .text */ + { + /* name */ ESCN_TEXT_name, + /* sh_type */ SHT_PROGBITS, + /* sh_entsize */ 0, + /* sh_flags */ SHF_ALLOC + SHF_EXECINSTR + }, +#if 0 + /* .build.attributes */ + { + /* name */ ESCN_ATTRIBUTES_name, + /* sh_type */ SHT_ATTRIBUTES, + /* sh_entsize */ 0, + /* sh_flags */ 0 + }, +#endif + /* Terminate array with a NULL name field */ + { + /* name */ (const char*)0, + /* sh_type */ 0, + /* sh_entsize */ 0, + /* sh_flags */ 0 + } +}; + diff --git a/src/core/dsp/ocl_load/DLOAD/elf32.h b/src/core/dsp/ocl_load/DLOAD/elf32.h new file mode 100644 index 0000000..67358d6 --- /dev/null +++ b/src/core/dsp/ocl_load/DLOAD/elf32.h @@ -0,0 +1,756 @@ +/* +* elf32.h +* +* Basic Data Structures for 32-bit ELF Object Format Files +* +* The data structures in this file come primarily from this specification: +* +* Tool Interface Standard (TIS) +* Executable and Linking Format (ELF) Specification +* Version 1.2 +* +* TIS Committee +* May 1995 +* +* Additions and enhancements from this specification are also included: +* +* System V Application Binary Interface +* DRAFT 17 +* December 2003 +* +* http://sco.com/developers/gabi/2003-12-17/contents.html +* +* +* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/ +* +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: 
+* +* Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* +* Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the +* distribution. +* +* Neither the name of Texas Instruments Incorporated nor the names of +* its contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ + +#ifndef ELF32_H +#define ELF32_H + +#include <inttypes.h> + +/*---------------------------------------------------------------------------*/ +/* 32-Bit Data Types (Figure 1-2, page 1-2) */ +/*---------------------------------------------------------------------------*/ +typedef uint32_t Elf32_Addr; +typedef uint16_t Elf32_Half; +typedef uint32_t Elf32_Off; +typedef int32_t Elf32_Sword; +typedef uint32_t Elf32_Word; + + +/*****************************************************************************/ +/* ELF Header */ +/* PP. 
1-4 */ +/*****************************************************************************/ + +/*---------------------------------------------------------------------------*/ +/* ELF Identification Indexes (indexes into Elf32_Ehdr.e_ident[] below) */ +/*---------------------------------------------------------------------------*/ +enum +{ + EI_MAG0 = 0, /* File identification */ + EI_MAG1 = 1, /* File identification */ + EI_MAG2 = 2, /* File identification */ + EI_MAG3 = 3, /* File identification */ + EI_CLASS = 4, /* File class */ + EI_DATA = 5, /* Data encoding */ + EI_VERSION = 6, /* File version */ + EI_OSABI = 7, /* Operating system / ABI */ + EI_ABIVERSION = 8, /* ABI version */ + EI_PAD = 9, /* Start of padding bytes */ + EI_NIDENT = 16 /* Size of Elf32_Ehdr.e_ident[] */ +}; + + +/*---------------------------------------------------------------------------*/ +/* ELF Header Data Structure */ +/*---------------------------------------------------------------------------*/ +struct Elf32_Ehdr +{ + uint8_t e_ident[EI_NIDENT]; /* ELF Magic Number */ + Elf32_Half e_type; /* Object File Type */ + Elf32_Half e_machine; /* Target Processor */ + Elf32_Word e_version; /* Object File Version */ + Elf32_Addr e_entry; /* Entry Point */ + Elf32_Off e_phoff; /* Program Header Table Offset */ + Elf32_Off e_shoff; /* Section Header Table Offset */ + Elf32_Word e_flags; /* Processor-Specific Flags */ + Elf32_Half e_ehsize; /* Size of ELF header */ + Elf32_Half e_phentsize; /* Size of a Program Header */ + Elf32_Half e_phnum; /* # Entries in Program Header Table */ + Elf32_Half e_shentsize; /* Size of a Section Header */ + Elf32_Half e_shnum; /* # Entries in Section Header Table */ + Elf32_Half e_shstrndx; /* Section Name String Table Section */ +}; + + +/*---------------------------------------------------------------------------*/ +/* Object File Types (value of "e_type") */ +/*---------------------------------------------------------------------------*/ +enum +{ + ET_NONE = 0, /* 
No file type */ + ET_REL = 1, /* Relocatable file */ + ET_EXEC = 2, /* Executable file */ + ET_DYN = 3, /* Shared object file */ + ET_CORE = 4, /* Core file */ + ET_LOOS = 0xfe00, /* First OS-specific value */ + ET_HIPS = 0xfeff, /* Last OS-specific value */ + ET_LOPROC = 0xff00, /* First processor-specific value */ + ET_HIPROC = 0xffff /* Last processor-specific value */ +}; + + +/*---------------------------------------------------------------------------*/ +/* Target Processors (value of "e_machine") */ +/*---------------------------------------------------------------------------*/ +enum +{ + EM_NONE = 0, /* No machine */ + EM_M32 = 1, /* AT&T WE 32100 */ + EM_SPARC = 2, /* SPARC */ + EM_386 = 3, /* Intel 80386 */ + EM_68K = 4, /* Motorola 68000 */ + EM_88K = 5, /* Motorola 88000 */ + EM_860 = 7, /* Intel 80860 */ + EM_MIPS = 8, /* MIPS I Architecture */ + EM_S370 = 9, /* IBM System/370 Processor */ + EM_MIPS_RS3_LE = 10, /* MIPS RS3000 Little-endian */ + EM_PARISC = 15, /* Hewlett-Packard PA-RISC */ + EM_VPP500 = 17, /* Fujitsu VPP500 */ + EM_SPARC32PLUS = 18, /* Enhanced instruction set SPARC */ + EM_960 = 19, /* Intel 80960 */ + EM_PPC = 20, /* PowerPC */ + EM_PPC64 = 21, /* 64-bit PowerPC */ + EM_S390 = 22, /* IBM System/390 Processor */ + EM_V800 = 36, /* NEC V800 */ + EM_FR20 = 37, /* Fujitsu FR20 */ + EM_RH32 = 38, /* TRW RH-32 */ + EM_RCE = 39, /* Motorola RCE */ + EM_ARM = 40, /* Advanced RISC Machines ARM */ + EM_ALPHA = 41, /* Digital Alpha */ + EM_SH = 42, /* Hitachi SH */ + EM_SPARCV9 = 43, /* SPARC Version 9 */ + EM_TRICORE = 44, /* Siemens TriCore embedded processor */ + EM_ARC = 45, /* "Argonaut RISC Core, Argonaut Technologies Inc. 
*/ + EM_H8_300 = 46, /* Hitachi H8/300 */ + EM_H8_300H = 47, /* Hitachi H8/300H */ + EM_H8S = 48, /* Hitachi H8S */ + EM_H8_500 = 49, /* Hitachi H8/500 */ + EM_IA_64 = 50, /* Intel IA-64 processor architecture */ + EM_MIPS_X = 51, /* Stanford MIPS-X */ + EM_COLDFIRE = 52, /* Motorola ColdFire */ + EM_68HC12 = 53, /* Motorola M68HC12 */ + EM_MMA = 54, /* Fujitsu MMA Multimedia Accelerator */ + EM_PCP = 55, /* Siemens PCP */ + EM_NCPU = 56, /* Sony nCPU embedded RISC processor */ + EM_NDR1 = 57, /* Denso NDR1 microprocessor */ + EM_STARCORE = 58, /* Motorola Star*Core processor */ + EM_ME16 = 59, /* Toyota ME16 processor */ + EM_ST100 = 60, /* STMicroelectronics ST100 processor */ + EM_TINYJ = 61, /* Advanced Logic Corp. TinyJ embedded processor f */ + EM_X86_64 = 62, /* AMD x86-64 architecture */ + EM_PDSP = 63, /* Sony DSP Processor */ + EM_PDP10 = 64, /* Digital Equipment Corp. PDP-10 */ + EM_PDP11 = 65, /* Digital Equipment Corp. PDP-11 */ + EM_FX66 = 66, /* Siemens FX66 microcontroller */ + EM_ST9PLUS = 67, /* STMicroelectronics ST9+ 8/16 bit microcontrolle */ + EM_ST7 = 68, /* STMicroelectronics ST7 8-bit microcontroller */ + EM_68HC16 = 69, /* Motorola MC68HC16 Microcontroller */ + EM_68HC11 = 70, /* Motorola MC68HC11 Microcontroller */ + EM_68HC08 = 71, /* Motorola MC68HC08 Microcontroller */ + EM_68HC05 = 72, /* Motorola MC68HC05 Microcontroller */ + EM_SVX = 73, /* Silicon Graphics SVx */ + EM_ST19 = 74, /* STMicroelectronics ST19 8-bit microcontroller */ + EM_VAX = 75, /* Digital VAX */ + EM_CRIS = 76, /* Axis Communications 32-bit embedded processor */ + EM_JAVELIN = 77, /* Infineon Technologies 32-bit embedded processor */ + EM_FIREPATH = 78, /* Element 14 64-bit DSP Processor */ + EM_ZSP = 79, /* LSI Logic 16-bit DSP Processor */ + EM_MMIX = 80, /* Donald Knuth's educational 64-bit processor */ + EM_HUANY = 81, /* Harvard University machine-independent object f */ + EM_PRISM = 82, /* SiTera Prism */ + EM_AVR = 83, /* Atmel AVR 8-bit microcontroller */ + 
EM_FR30 = 84, /* Fujitsu FR30 */ + EM_D10V = 85, /* Mitsubishi D10V */ + EM_D30V = 86, /* Mitsubishi D30V */ + EM_V850 = 87, /* NEC v850 */ + EM_M32R = 88, /* Mitsubishi M32R */ + EM_MN10300 = 89, /* Matsushita MN10300 */ + EM_MN10200 = 90, /* Matsushita MN10200 */ + EM_PJ = 91, /* picoJava */ + EM_OPENRISC = 92, /* OpenRISC 32-bit embedded processor */ + EM_ARC_A5 = 93, /* ARC Cores Tangent-A5 */ + EM_XTENSA = 94, /* Tensilica Xtensa Architecture */ + EM_VIDEOCORE = 95, /* Alphamosaic VideoCore processor */ + EM_TMM_GPP = 96, /* Thompson Multimedia General Purpose Processor */ + EM_NS32K = 97, /* National Semiconductor 32000 series */ + EM_TPC = 98, /* Tenor Network TPC processor */ + EM_SNP1K = 99, /* Trebia SNP 1000 processor */ + EM_ST200 = 100, /* STMicroelectronics (www.st.com) ST200 microcont */ + EM_IP2K = 101, /* Ubicom IP2xxx microcontroller family */ + EM_MAX = 102, /* MAX Processor */ + EM_CR = 103, /* National Semiconductor CompactRISC microprocess */ + EM_F2MC16 = 104, /* Fujitsu F2MC16 */ + EM_MSP430 = 105, /* Texas Instruments embedded microcontroller msp4 */ + EM_BLACKFIN = 106, /* Analog Devices Blackfin (DSP) processor */ + EM_SE_C33 = 107, /* S1C33 Family of Seiko Epson processors */ + EM_SEP = 108, /* Sharp embedded microprocessor */ + EM_ARCA = 109, /* Arca RISC Microprocessor */ + EM_UNICORE = 110, /* Microprocessor series from PKU-Unity Ltd. and M */ + + /*------------------------------------------------------------------------*/ + /* ELF Magic Numbers Reserved For Texas Instruments */ + /* */ + /* The magic numbers 140-159 were reserved through SCO to be included */ + /* in the official ELF specification. Please see Don Darling */ + /* regarding any changes or allocation of the numbers below. */ + /* */ + /* When we allocate a number for use, SCO needs to be notified so they */ + /* can update the ELF specification accordingly. 
*/ + /*------------------------------------------------------------------------*/ + EM_TI_C6000 = 140, /* Reserved for Texas Instruments; unused */ + EM_TI_UNUSED02 = 141, /* Reserved for Texas Instruments; unused */ + EM_TI_UNUSED03 = 142, /* Reserved for Texas Instruments; unused */ + EM_TI_UNUSED04 = 143, /* Reserved for Texas Instruments; unused */ + EM_TI_UNUSED05 = 144, /* Reserved for Texas Instruments; unused */ + EM_TI_UNUSED06 = 145, /* Reserved for Texas Instruments; unused */ + EM_TI_UNUSED07 = 146, /* Reserved for Texas Instruments; unused */ + EM_TI_UNUSED08 = 147, /* Reserved for Texas Instruments; unused */ + EM_TI_UNUSED09 = 148, /* Reserved for Texas Instruments; unused */ + EM_TI_UNUSED10 = 149, /* Reserved for Texas Instruments; unused */ + EM_TI_UNUSED11 = 150, /* Reserved for Texas Instruments; unused */ + EM_TI_UNUSED12 = 151, /* Reserved for Texas Instruments; unused */ + EM_TI_UNUSED13 = 152, /* Reserved for Texas Instruments; unused */ + EM_TI_UNUSED14 = 153, /* Reserved for Texas Instruments; unused */ + EM_TI_UNUSED15 = 154, /* Reserved for Texas Instruments; unused */ + EM_TI_UNUSED16 = 155, /* Reserved for Texas Instruments; unused */ + EM_TI_UNUSED17 = 156, /* Reserved for Texas Instruments; unused */ + EM_TI_UNUSED18 = 157, /* Reserved for Texas Instruments; unused */ + EM_TI_UNUSED19 = 158, /* Reserved for Texas Instruments; unused */ + EM_TI_UNUSED20 = 159 /* Reserved for Texas Instruments; unused */ +}; + + +/*---------------------------------------------------------------------------*/ +/* Object File Version (value of "e_version") */ +/*---------------------------------------------------------------------------*/ +enum +{ + EV_NONE = 0, /* Invalid version */ + EV_CURRENT = 1 /* Current version */ +}; + + +/*****************************************************************************/ +/* ELF Identification */ +/* PP. 
1-6 */ +/*****************************************************************************/ + +/*---------------------------------------------------------------------------*/ +/* Identification Values for ELF Files */ +/*---------------------------------------------------------------------------*/ + +/* EI_MAG0 to EI_MAG3 */ +enum +{ + ELFMAG0 = 0x7f, /* e_ident[EI_MAG0] */ + ELFMAG1 = 'E', /* e_ident[EI_MAG1] */ + ELFMAG2 = 'L', /* e_ident[EI_MAG2] */ + ELFMAG3 = 'F' /* e_ident[EI_MAG3] */ +}; + +/* EI_CLASS */ +enum +{ + ELFCLASSNONE = 0, /* Invalid class */ + ELFCLASS32 = 1, /* 32-bit objects */ + ELFCLASS64 = 2 /* 64-bit objects */ +}; + +/* EI_DATA */ +enum +{ + ELFDATANONE = 0, /* Invalid data encoding */ + ELFDATA2LSB = 1, /* Little-endian data */ + ELFDATA2MSB = 2 /* Big-endian data */ +}; + +/* EI_OSABI */ +enum +{ + ELFOSABI_NONE = 0, /* No extensions or unspecified */ + ELFOSABI_HPUX = 1, /* Hewlett-Packard HP-UX */ + ELFOSABI_NETBSD = 2, /* NetBSD */ + ELFOSABI_LINUX = 3, /* Linux */ + ELFOSABI_SOLARIS = 6, /* Sun Solaris */ + ELFOSABI_AIX = 7, /* AIX */ + ELFOSABI_IRIX = 8, /* IRIX */ + ELFOSABI_FREEBSD = 9, /* FreeBSD */ + ELFOSABI_TRU64 = 10, /* Compaq TRU64 UNIX */ + ELFOSABI_MODESTO = 11, /* Novell Modesto */ + ELFOSABI_OPENBSD = 12, /* Open BSD */ + ELFOSABI_OPENVMS = 13, /* Open VMS */ + ELFOSABI_NSK = 14, /* Hewlett-Packard Non-Stop Kernel */ + ELFOSABI_AROS = 15 /* Amiga Research OS */ +}; + +/*****************************************************************************/ +/* Program Header */ +/* PP. 
2-2 */ +/*****************************************************************************/ + +/*---------------------------------------------------------------------------*/ +/* Program Header Data Structure */ +/*---------------------------------------------------------------------------*/ +struct Elf32_Phdr +{ + Elf32_Word p_type; /* Segment type */ + Elf32_Off p_offset; /* Segment file offset */ + Elf32_Addr p_vaddr; /* Segment virtual address */ + Elf32_Addr p_paddr; /* Segment physical address */ + Elf32_Word p_filesz; /* Segment file image size */ + Elf32_Word p_memsz; /* Segment memory image size */ + Elf32_Word p_flags; /* Segment flags */ + Elf32_Word p_align; /* Segment alignment */ +}; + +/*---------------------------------------------------------------------------*/ +/* Segment Types (value of "p_type") */ +/*---------------------------------------------------------------------------*/ +enum +{ + PT_NULL = 0, /* Unused table entry */ + PT_LOAD = 1, /* Loadable segment */ + PT_DYNAMIC = 2, /* Dynamic linking information */ + PT_INTERP = 3, /* Interpreter path string location */ + PT_NOTE = 4, /* Location and size of auxiliary information */ + PT_SHLIB = 5, /* Shared library information */ + PT_PHDR = 6, /* Location and size of program header table */ + PT_TLS = 7, /* Specifies the Thread-Local Storage template */ + PT_LOOS = 0x60000000, /* First OS-specific value */ + PT_HIOS = 0x6fffffff, /* Last OS-specific value */ + PT_LOPROC = 0x70000000, /* First processor-specific value */ + PT_HIPROC = 0x7fffffff /* Last processor-specific value */ +}; + +/*---------------------------------------------------------------------------*/ +/* Segment Permissions (value of "p_flags") */ +/*---------------------------------------------------------------------------*/ +enum +{ + PF_X = 0x1, /* Execute */ + PF_W = 0x2, /* Write */ + PF_R = 0x4, /* Read */ + PF_MASKOS = 0x0ff00000, /* OS-specific mask */ + PF_MASKPROC = 0xf0000000 /* Processor-specific mask */ +}; + 
+/*****************************************************************************/ +/* Sections */ +/* PP. 1-9 */ +/*****************************************************************************/ + +/*---------------------------------------------------------------------------*/ +/* Section Header Data Structure */ +/*---------------------------------------------------------------------------*/ +struct Elf32_Shdr +{ + Elf32_Word sh_name; /* Section name (offset into string section) */ + Elf32_Word sh_type; /* Section type */ + Elf32_Word sh_flags; /* Section flags */ + Elf32_Addr sh_addr; /* Address in memory image */ + Elf32_Off sh_offset; /* File offset of section data */ + Elf32_Word sh_size; /* Size of the section in bytes */ + Elf32_Word sh_link; /* Link to the section header table */ + Elf32_Word sh_info; /* Extra information depending on section type */ + Elf32_Word sh_addralign; /* Address alignment constraints */ + Elf32_Word sh_entsize; /* Size of fixed-size entries in section */ +}; + +/*---------------------------------------------------------------------------*/ +/* Special Section Indexes */ +/*---------------------------------------------------------------------------*/ +enum +{ + SHN_UNDEF = 0, /* Referenced by undefined values */ + SHN_LORESERVE = 0xff00, /* First reserved index */ + SHN_LOPROC = 0xff00, /* First processor-specific index */ + SHN_HIPROC = 0xff1f, /* Last processor-specific index */ + SHN_LOOS = 0xff20, /* First OS-specific index */ + SHN_HIOS = 0xff3f, /* Last OS-specific index */ + SHN_ABS = 0xfff1, /* Referenced by absolute values */ + SHN_COMMON = 0xfff2, /* Referenced by common values */ + SHN_XINDEX = 0xffff, /* Indirect index reference (escape value) */ + SHN_HIRESERVE = 0xffff /* Last reserved index */ +}; + +/*---------------------------------------------------------------------------*/ +/* Section Types (value of "sh_type") */ +/*---------------------------------------------------------------------------*/ +enum +{ + SHT_NULL 
= 0, /* Inactive section */ + SHT_PROGBITS = 1, /* Application-specific information */ + SHT_SYMTAB = 2, /* Symbol table */ + SHT_STRTAB = 3, /* String table */ + SHT_RELA = 4, /* Relocation entries (explicit addends) */ + SHT_HASH = 5, /* Symbol hash table */ + SHT_DYNAMIC = 6, /* Dynamic linking information */ + SHT_NOTE = 7, /* Miscellaneous information */ + SHT_NOBITS = 8, /* Contains no data in file */ + SHT_REL = 9, /* Relocation entries (no expl. addends) */ + SHT_SHLIB = 10, /* Shared library */ + SHT_DYNSYM = 11, /* Dynamic symbol table */ + SHT_INIT_ARRAY = 14, /* Pointers to initialization functions */ + SHT_FINI_ARRAY = 15, /* Pointers to termination functions */ + SHT_PREINIT_ARRAY = 16, /* Pointers to pre-init functions */ + SHT_GROUP = 17, /* Section group */ + SHT_SYMTAB_SHNDX = 18, /* Section indexes for SHN_XINDEX refs. */ + SHT_LOOS = 0x60000000, /* First OS-specific type */ + SHT_HIOS = 0x6fffffff, /* Last OS-specific type */ + SHT_LOPROC = 0x70000000, /* First processor-specific type */ + SHT_HIPROC = 0x7fffffff, /* Last processor-specific type */ + SHT_LOUSER = 0x80000000, /* First application-specific type */ + SHT_HIUSER = 0xffffffff /* Last application-specific type */ +}; + +/*---------------------------------------------------------------------------*/ +/* Section Attribute Flags (value of "sh_flags") */ +/*---------------------------------------------------------------------------*/ +enum +{ + SHF_WRITE = 0x1, /* Writable during process execution */ + SHF_ALLOC = 0x2, /* Loaded into processor memory */ + SHF_EXECINSTR = 0x4, /* Contains executable instructions */ + SHF_MERGE = 0x10, /* Can be merged */ + SHF_STRINGS = 0x20, /* Contains null-terminated strings */ + SHF_INFO_LINK = 0x40, /* sh_info contains a section index */ + SHF_LINK_ORDER = 0x80, /* Maintain section ordering */ + SHF_OS_NONCONFORMING = 0x100, /* OS-specific processing required */ + SHF_GROUP = 0x200, /* Member of a section group */ + SHF_TLS = 0x400, /* Contains 
Thread-Local Storage */ + SHF_MASKOS = 0x0ff00000, /* Mask of OS-specific flags */ + SHF_MASKPROC = 0xf0000000 /* Mask for processor-specific flags */ +}; + +/*---------------------------------------------------------------------------*/ +/* Section Group Flags */ +/*---------------------------------------------------------------------------*/ +enum +{ + GRP_COMDAT = 0x1, /* Common data; only one is kept by linker */ + GRP_MASKOS = 0x0ff00000, /* Mask for OS-specific group flags */ + GRP_MASKPROC = 0xf0000000 /* Mask for processor-specific group flags */ +}; + + +/*****************************************************************************/ +/* Symbol Table */ +/* PP. 1-18 */ +/*****************************************************************************/ + +/*---------------------------------------------------------------------------*/ +/* Symbol Table Entry Data Structure */ +/*---------------------------------------------------------------------------*/ +struct Elf32_Sym +{ + Elf32_Word st_name; /* String table offset for symbol name */ + Elf32_Addr st_value; /* Symbol value */ + Elf32_Word st_size; /* Symbol size */ + uint8_t st_info; /* Symbol type and binding */ + uint8_t st_other; /* Symbol visibility */ + Elf32_Half st_shndx; /* Symbol type / defining section */ +}; + +/*---------------------------------------------------------------------------*/ +/* Undefined Symbol Index */ +/*---------------------------------------------------------------------------*/ +enum +{ + STN_UNDEF = 0 /* First symbol table entry is always undefined */ +}; + +/*---------------------------------------------------------------------------*/ +/* Symbol Binding and Type Utility Functions. 
*/ +/*---------------------------------------------------------------------------*/ +static inline uint8_t ELF32_ST_BIND(uint8_t i) { return (i >> 4); } +static inline uint8_t ELF32_ST_TYPE(uint8_t i) { return (i & 0xf); } +static inline uint8_t ELF32_ST_INFO(uint8_t b, uint8_t t) + { return ((b << 4) + (t & 0xf)); } +static inline uint8_t ELF32_ST_VISIBILITY(uint8_t o) { return (o & 0x3); } + + +/*---------------------------------------------------------------------------*/ +/* Symbol Binding (value returned by ELF32_ST_BIND()) */ +/*---------------------------------------------------------------------------*/ +enum +{ + STB_LOCAL = 0, /* Symbol does not have external linkage */ + STB_GLOBAL = 1, /* Symbol has external linkage */ + STB_WEAK = 2, /* Symbol has weak external linkage */ + STB_LOOS = 10, /* First OS-specific binding */ + STB_HIOS = 12, /* Last OS-specific binding */ + STB_LOPROC = 13, /* First processor-specific binding */ + STB_HIPROC = 15 /* Last processor-specific binding */ +}; + +/*---------------------------------------------------------------------------*/ +/* Symbol Types (value returned by ELF32_ST_TYPE()) */ +/*---------------------------------------------------------------------------*/ +enum +{ + STT_NOTYPE = 0, /* Unspecified type */ + STT_OBJECT = 1, /* Associated with a data object */ + STT_FUNC = 2, /* Associated with executable code */ + STT_SECTION = 3, /* Associated with a section */ + STT_FILE = 4, /* Associated with a source file */ + STT_COMMON = 5, /* Labels an uninitialized common block */ + STT_TLS = 6, /* Specifies a thread-local storage entity */ + STT_LOOS = 10, /* First OS-specific type */ + STT_HIOS = 12, /* Last OS-specific type */ + STT_LOPROC = 13, /* First processor-specific type */ + STT_HIPROC = 15 /* Last processor-specific type */ +}; + +/*---------------------------------------------------------------------------*/ +/* Symbol Visibility (value returned by ELF32_ST_VISIBILITY()) */ 
+/*---------------------------------------------------------------------------*/ +enum +{ + STV_DEFAULT = 0, /* Visibility specified by binding type */ + STV_INTERNAL = 1, /* Like STV_HIDDEN, with processor-specific semantics */ + STV_HIDDEN = 2, /* Not visible to other components */ + STV_PROTECTED = 3 /* Visible in other components but not preemptable */ +}; + +/*****************************************************************************/ +/* Relocation */ +/* PP. 1-22 */ +/*****************************************************************************/ + +/*---------------------------------------------------------------------------*/ +/* Relocation Entries Data Structures */ +/*---------------------------------------------------------------------------*/ +struct Elf32_Rel +{ + Elf32_Addr r_offset; /* Offset of the relocatable value in the section */ + Elf32_Word r_info; /* Symbol table index and relocation type */ +}; + +struct Elf32_Rela +{ + Elf32_Addr r_offset; /* Offset of the relocatable value in the section */ + Elf32_Word r_info; /* Symbol table index and relocation type */ + Elf32_Sword r_addend; /* Constant addend used to compute new value */ +}; + +/*---------------------------------------------------------------------------*/ +/* Relocation Symbol and Type Utility Functions. */ +/*---------------------------------------------------------------------------*/ +static inline uint32_t ELF32_R_SYM(uint32_t i) { return (i >> 8); } +static inline uint8_t ELF32_R_TYPE(uint32_t i) { return (i & 0xFF); } +static inline uint32_t ELF32_R_INFO(uint32_t s, uint8_t t) + { return ((s << 8) + t); } + + +/*****************************************************************************/ +/* Dynamic Section */ +/* PP. 2-8 */ +/*****************************************************************************/ +struct Elf32_Dyn +{ + Elf32_Sword d_tag; + union + { + Elf32_Word d_val; + Elf32_Addr d_ptr; + } d_un; +}; + +/* Name Value d_un Executable Shared Obj. 
*/ +/* ---- ----- ---- ---------- ----------- */ +enum +{ + DT_NULL = 0, /* ignored mandatory mandatory */ + DT_NEEDED = 1, /* d_val optional optional */ + DT_PLTRELSZ = 2, /* d_val optional optional */ + DT_PLTGOT = 3, /* d_ptr optional optional */ + DT_HASH = 4, /* d_ptr mandatory mandatory */ + DT_STRTAB = 5, /* d_ptr mandatory mandatory */ + DT_SYMTAB = 6, /* d_ptr mandatory mandatory */ + DT_RELA = 7, /* d_ptr mandatory optional */ + DT_RELASZ = 8, /* d_val mandatory optional */ + DT_RELAENT = 9, /* d_val mandatory optional */ + DT_STRSZ = 10, /* d_val mandatory mandatory */ + DT_SYMENT = 11, /* d_val mandatory mandatory */ + DT_INIT = 12, /* d_ptr optional optional */ + DT_FINI = 13, /* d_ptr optional optional */ + DT_SONAME = 14, /* d_val ignored optional */ + DT_RPATH = 15, /* d_val optional ignored */ + DT_SYMBOLIC = 16, /* ignored ignored optional */ + DT_REL = 17, /* d_ptr mandatory optional */ + DT_RELSZ = 18, /* d_val mandatory optional */ + DT_RELENT = 19, /* d_val mandatory optional */ + DT_PLTREL = 20, /* d_val optional optional */ + DT_DEBUG = 21, /* d_ptr optional ignored */ + DT_TEXTREL = 22, /* ignored optional optional */ + DT_JMPREL = 23, /* d_ptr optional optional */ + DT_BIND_NOW = 24, /* ignored optional optional */ + DT_INIT_ARRAY = 25, /* d_ptr optional optional */ + DT_FINI_ARRAY = 26, /* d_ptr optional optional */ + DT_INIT_ARRAYSZ = 27, /* d_val optional optional */ + DT_FINI_ARRAYSZ = 28, /* d_val optional optional */ + DT_RUNPATH = 29, /* d_val optional optional */ + DT_FLAGS = 30, /* d_val optional optional */ + DT_ENCODING = 32, /* unspecified unspecified unspecified */ + DT_PREINIT_ARRAY = 32, /* d_ptr optional ignored */ + DT_PREINIT_ARRAYSZ = 33, /* d_val optional ignored */ + DT_LOOS = 0x60000000, /* unspecified unspecified unspecified */ + DT_HIOS = 0x6ffff000, /* unspecified unspecified unspecified */ + DT_LOPROC = 0x70000000, /* unspecified unspecified unspecified */ + DT_HIPROC = 0x7fffffff /* unspecified unspecified 
unspecified */ +}; + + +/*---------------------------------------------------------------------------*/ +/* DT_FLAGS values. */ +/*---------------------------------------------------------------------------*/ +enum +{ + DF_ORIGIN = 0x01, /* loaded object may reference $ORIGIN subst. string */ + DF_SYMBOLIC = 0x02, /* changes dynamic linker symbol resolution */ + DF_TEXTREL = 0x04, /* do not allow relocation of non-writable segments */ + DF_BIND_NOW = 0x08, /* don't use lazy binding */ + DF_STATIC_TLS = 0x10, /* do not load this file dynamically */ + DF_DIRECT_DEPENDENT = 0x20, /* limit global sym lookup to dependent list */ + DF_WORLD = 0x40 /* Linux style global sym lookup, breadth-first */ +}; + + +/*---------------------------------------------------------------------------*/ +/* Dynamic Tag Database. */ +/*---------------------------------------------------------------------------*/ + +/* Specifiers for which d_un union member to use */ + +enum +{ + EDYN_UNTYPE_IGNORED, + EDYN_UNTYPE_VAL, + EDYN_UNTYPE_PTR, + EDYN_UNTYPE_UNSPECIFIED +}; + + +/* Specifiers for executable/shared object file requirements */ + +enum +{ + EDYN_TAGREQ_IGNORED, + EDYN_TAGREQ_MANDATORY, + EDYN_TAGREQ_OPTIONAL, + EDYN_TAGREQ_UNSPECIFIED +}; + + +/* Data structure for one dynamic tag database entry */ + +struct EDYN_TAG +{ + const char* d_tag_name; /* tag name string */ + Elf32_Sword d_tag_value; /* DT_* tag value */ + Elf32_Word d_untype; /* which d_un union member to use */ + Elf32_Word d_exec_req; /* requirement for executable files */ + Elf32_Word d_shared_req; /* requirement for shared object files */ +}; + +extern const struct EDYN_TAG EDYN_TAG_DB[]; + +/*****************************************************************************/ +/* Special Section Database */ +/*****************************************************************************/ + +/*---------------------------------------------------------------------------*/ +/* Special Section Names */ 
+/*---------------------------------------------------------------------------*/ +#define ESCN_BSS_name ".bss" +#define ESCN_COMMENT_name ".comment" +#define ESCN_DATA1_name ".data1" +#define ESCN_DATA_name ".data" +#define ESCN_DEBUG_name ".debug" +#define ESCN_DYNAMIC_name ".dynamic" +#define ESCN_DYNSTR_name ".dynstr" +#define ESCN_DYNSYM_name ".dynsym" +#define ESCN_FINI_ARRAY_name ".fini_array" +#define ESCN_FINI_name ".fini" +#define ESCN_GOT_name ".got" +#define ESCN_HASH_name ".hash" +#define ESCN_INIT_ARRAY_name ".init_array" +#define ESCN_INIT_name ".init" +#define ESCN_INTERP_name ".interp" +#define ESCN_LINE_name ".line" +#define ESCN_NOTE_name ".note" +#define ESCN_PLT_name ".plt" +#define ESCN_PREINIT_ARRAY_name ".preinit_array" +#define ESCN_RELA_name ".rela" +#define ESCN_REL_name ".rel" +#define ESCN_RODATA1_name ".rodata1" +#define ESCN_RODATA_name ".rodata" +#define ESCN_SHSTRTAB_name ".shstrtab" +#define ESCN_STRTAB_name ".strtab" +#define ESCN_SYMTAB_SHNDX_name ".symtab_shndx" +#define ESCN_SYMTAB_name ".symtab" +#define ESCN_TBSS_name ".tbss" +#define ESCN_TDATA1_name ".tdata1" +#define ESCN_TDATA_name ".tdata" +#define ESCN_TEXT_name ".text" +#define ESCN_ATTRIBUTES_name "__TI_build_attributes" +#define ESCN_ICODE_name "__TI_ICODE" +#define ESCN_XREF_name "__TI_XREF" + +/*---------------------------------------------------------------------------*/ +/* Special Section Information Data Structure. 
*/ +/*---------------------------------------------------------------------------*/ +struct ESCN +{ + const char *name; + Elf32_Word sh_type; + Elf32_Word sh_entsize; + Elf32_Word sh_flags; +}; + +extern const struct ESCN ESCN_DB[]; + +#endif /* ELF32_H */ diff --git a/src/core/dsp/ocl_load/DLOAD/relocate.h b/src/core/dsp/ocl_load/DLOAD/relocate.h new file mode 100644 index 0000000..ee21aa9 --- /dev/null +++ b/src/core/dsp/ocl_load/DLOAD/relocate.h @@ -0,0 +1,64 @@ +/* +* relocate.h +* +* Declare names and IDs of all C6x-specific relocation types supported +* in the dynamic loader. +* +* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/ +* +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* +* Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the +* distribution. +* +* Neither the name of Texas Instruments Incorporated nor the names of +* its contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ + +#ifndef RELOCATE_H +#define RELOCATE_H + +#include <inttypes.h> +#include "elf32.h" +#include "dload.h" +#include "dload_api.h" + +/*---------------------------------------------------------------------------*/ +/* Declare some globals that are used for internal debugging and profiling. */ +/*---------------------------------------------------------------------------*/ +#if LOADER_DEBUG || LOADER_PROFILE +#include <time.h> +extern int DLREL_relocations; +extern time_t DLREL_total_reloc_time; +#endif + + +/*---------------------------------------------------------------------------*/ +/* Landing point for core loader's relocation processor. */ +/*---------------------------------------------------------------------------*/ +void DLREL_relocate(DLOAD_HANDLE handle, LOADER_FILE_DESC *fd, + DLIMP_Dynamic_Module *dyn_module); + +#endif diff --git a/src/core/dsp/ocl_load/DLOAD/symtab.h b/src/core/dsp/ocl_load/DLOAD/symtab.h new file mode 100644 index 0000000..1f06584 --- /dev/null +++ b/src/core/dsp/ocl_load/DLOAD/symtab.h @@ -0,0 +1,72 @@ +/* +* symtab.h +* +* Specification of functions used by the core loader to create, maintain, +* and destroy internal symbol tables. 
+* +* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/ +* +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* +* Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the +* distribution. +* +* Neither the name of Texas Instruments Incorporated nor the names of +* its contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ + +#ifndef SYMTAB_H +#define SYMTAB_H + +#include "ArrayList.h" +#include "dload.h" + +/*****************************************************************************/ +/* This is the top-level application file handle. It should only be needed */ +/* under the Linux and DSBT models. 
*/ +/*****************************************************************************/ +extern int32_t DLIMP_application_handle; + +/*---------------------------------------------------------------------------*/ +/* Core Loader Symbol Table Management Functions */ +/*---------------------------------------------------------------------------*/ +BOOL DLSYM_canonical_lookup(DLOAD_HANDLE handle, + int32_t sym_index, + DLIMP_Dynamic_Module *dyn_module, + Elf32_Addr *sym_value); + +BOOL DLSYM_global_lookup(DLOAD_HANDLE handle, + const char *sym_name, + DLIMP_Loaded_Module *pentry, + Elf32_Addr *sym_value); + +BOOL DLSYM_lookup_local_symtab(const char *sym_name, + struct Elf32_Sym *symtab, + Elf32_Word symnum, + Elf32_Addr *sym_value); + +void DLSYM_copy_globals(DLIMP_Dynamic_Module *dyn_module); + +#endif diff --git a/src/core/dsp/ocl_load/DLOAD/util.h b/src/core/dsp/ocl_load/DLOAD/util.h new file mode 100644 index 0000000..24c5b3f --- /dev/null +++ b/src/core/dsp/ocl_load/DLOAD/util.h @@ -0,0 +1,89 @@ +/* +* util.h +* +* Definition of some useful string comparison routines (not +* provided on all platforms) and a few generic macros. +* +* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/ +* +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* +* Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the +* distribution. +* +* Neither the name of Texas Instruments Incorporated nor the names of +* its contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. 
+* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ + +#ifndef UTIL_H +#define UTIL_H + +#include <ctype.h> + +#if !defined(__linux) + +/*****************************************************************************/ +/* STRCASECMP() - Case-insensitive strcmp. */ +/*****************************************************************************/ +static int strcasecmp(const char* s1, const char* s2) +{ + char c1, c2; + do { c1 = *s1++; c2 = *s2++; } + while (c1 && c2 && (tolower(c1) == tolower(c2))); + + return tolower(c1) - tolower(c2); +} + +/*****************************************************************************/ +/* STRNCASECMP() - Case-insensitive strncmp. */ +/*****************************************************************************/ +static int strncasecmp(const char* s1, const char* s2, size_t n) +{ + char c1, c2; + + if (!n) return 0; + + do { c1 = *s1++; c2 = *s2++; } + while (--n && c1 && c2 && (tolower(c1) == tolower(c2))); + + return tolower(c1) - tolower(c2); +} + +#endif + +/*****************************************************************************/ +/* Define MIN and MAX macros. */ +/*****************************************************************************/ +#define MIN(x,y) (((x) > (y)) ? 
(y) : (x)) +#define MAX(x,y) (((x) >= (y)) ? (x) : (y)) + +/*****************************************************************************/ +/* C implementation of 'bool' type. */ +/*****************************************************************************/ +typedef int BOOL; +#define TRUE 1 +#define FALSE 0 + +#endif diff --git a/src/core/dsp/ocl_load/DLOAD/version.h b/src/core/dsp/ocl_load/DLOAD/version.h new file mode 100644 index 0000000..e36d1a9 --- /dev/null +++ b/src/core/dsp/ocl_load/DLOAD/version.h @@ -0,0 +1,63 @@ +/* +* version.h +* +* Dynamic Loader source version identification. +* +* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/ +* +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* +* Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the +* distribution. +* +* Neither the name of Texas Instruments Incorporated nor the names of +* its contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ + +#ifndef _VERSION_H_ +#define _VERSION_H_ + +/*****************************************************************************/ +/* VERSION NUMBER COMPONENTS - ALWAYS INCREASING!! */ +/* Initial version ID is 1.0.0. Successive version ID's will be incremented */ +/* by automated processes during release port. */ +/*****************************************************************************/ +#define VERSION_MAJOR 1 +#define VERSION_MINOR 0 +#define VERSION_PATCH 0 + +/******************************************************************************/ +/* Macros used to convert version macros into strings. */ +/******************************************************************************/ +#define MKCSTR(_str) #_str +#define MKMSTR(_str) MKCSTR(_str) + +/******************************************************************************/ +/* VERSION string construction macros. */ +/******************************************************************************/ +#define VERSTR MKMSTR(VERSION_MAJOR) "." MKMSTR(VERSION_MINOR) "." 
MKMSTR(VERSION_PATCH) +#define VERSION "Texas Instruments Dynamic Loader API/Core v" VERSTR /* space needed: C++11 lexes "lit"IDENT as a literal suffix */ + +#endif diff --git a/src/core/dsp/ocl_load/DLOAD/virtual_targets.h b/src/core/dsp/ocl_load/DLOAD/virtual_targets.h new file mode 100644 index 0000000..1d44b4d --- /dev/null +++ b/src/core/dsp/ocl_load/DLOAD/virtual_targets.h @@ -0,0 +1,90 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "dload.h" +#include "elf32.h" + +#ifdef C60_TARGET +#include "c60_dynamic.h" +#include "c60_reloc.h" +#endif + +#ifdef ARM_TARGET +#include "arm_dynamic.h" +#include "arm_reloc.h" +#endif + +/*****************************************************************************/ +/* Define a virtual target class to give access to target specific functions */ +/*****************************************************************************/ +typedef struct vtarget +{ + int machine_id; + + BOOL (*relocate_dynamic_tag_info)(DLIMP_Dynamic_Module *dyn_module, int i); + BOOL (*process_eiosabi)(DLIMP_Dynamic_Module* dyn_module); + BOOL (*process_dynamic_tag)(DLIMP_Dynamic_Module *dyn_module, int i); + void (*relocate)(DLOAD_HANDLE handle, LOADER_FILE_DESC *elf_file, + DLIMP_Dynamic_Module *dyn_module); + +} VIRTUAL_TARGET; + + + +/*****************************************************************************/ +/* Populate this for each target supported. 
*/ +/*****************************************************************************/ +VIRTUAL_TARGET vt_arr[] = { + +#ifdef C60_TARGET + { + EM_TI_C6000, + DLDYN_c60_relocate_dynamic_tag_info, + DLDYN_c60_process_eiosabi, + DLDYN_c60_process_dynamic_tag, + DLREL_c60_relocate + }, +#endif +#ifdef ARM_TARGET + { + EM_ARM, + DLDYN_arm_relocate_dynamic_tag_info, + DLDYN_arm_process_eiosabi, + DLDYN_arm_process_dynamic_tag, + DLREL_arm_relocate + }, +#endif + { + EM_NONE, + 0, + 0, + 0, + 0 + } +}; + + diff --git a/src/core/dsp/ocl_load/DLOAD_API/api_version_change.log b/src/core/dsp/ocl_load/DLOAD_API/api_version_change.log new file mode 100644 index 0000000..689cfe6 --- /dev/null +++ b/src/core/dsp/ocl_load/DLOAD_API/api_version_change.log @@ -0,0 +1,33 @@ + + Dynamic Loader API and Loader Core - Version Number Change Log + ============================================================== + + Version Number Description + -------------------------- + + The version number associated with the Dynamic Loader API and the Loader Core + sources has three components: + + <major version>.<minor version>.<patch version> + + major version - is incremented if there is a change to the API that creates a + compatibility discontinuity. + + minor version - is incremented if functionality is added to the API without + causing a compatibility discontinuity. + + patch version - is incremented if a defect has been repaired, a performance + enhancement has been added, or the source code has been + refactored in some way. There should not be a compatibility + discontinuity created by an increment to the patch version. + + Version Number Change Log + ------------------------- + + 1.0.0 - 17 July 2009 - Initial release of dynamic loader API and loader + core sources. + + 2.0.0 - 1 Feb 2013 - Add client handle to several DLIF functions. + - Add DLIF_exit() for loader abort. 
+ + diff --git a/src/core/dsp/ocl_load/DLOAD_API/dload_api.h b/src/core/dsp/ocl_load/DLOAD_API/dload_api.h new file mode 100644 index 0000000..95de10f --- /dev/null +++ b/src/core/dsp/ocl_load/DLOAD_API/dload_api.h @@ -0,0 +1,700 @@ +/* +* dload_api.h +* +* Dynamic Loader API Specification +* -------------------------------- +* +* Client-side of API is assumed to be platform dependent, but object file +* format independent. +* +* Core Loader side of API is assumed to be platform independent, but +* object file format dependent and target dependent. +* +* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/ +* +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* +* Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the +* distribution. +* +* Neither the name of Texas Instruments Incorporated nor the names of +* its contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ + +#ifndef DLOAD_API_H +#define DLOAD_API_H + +#include <inttypes.h> +#include <stdio.h> +#include "util.h" + +extern int debugging_on; + +/*****************************************************************************/ +/* Specification of Loader File Descriptor. If client side of the loader */ +/* supports virtual memory, this may need to be updated to facilitate the */ +/* use of mmap(). */ +/*****************************************************************************/ +typedef FILE LOADER_FILE_DESC; + +static const int LOADER_SEEK_SET = SEEK_SET; +static const int LOADER_SEEK_CUR = SEEK_CUR; +static const int LOADER_SEEK_END = SEEK_END; + +/*****************************************************************************/ +/* TARGET_ADDRESS - type suitable for storing target memory address values. 
*/ +/*****************************************************************************/ +typedef uint32_t TARGET_ADDRESS; + +/*****************************************************************************/ +/* Define DLOAD Object Handle */ +/*****************************************************************************/ +typedef void * DLOAD_HANDLE; + +/*****************************************************************************/ +/* Core Loader Provided API Functions (Core Loader Entry Points) */ +/*****************************************************************************/ + +/*---------------------------------------------------------------------------*/ +/* DLOAD_version() */ +/* */ +/* Return a string constant representation for the version ID of the */ +/* dynamic loader's core loader source code. */ +/* */ +/*---------------------------------------------------------------------------*/ +#include "version.h" +#define DLOAD_version() VERSION + +/*---------------------------------------------------------------------------*/ +/* DLOAD_create() */ +/* */ +/* Construct and initialize the dynamic loader core's handle. */ +/* */ +/*---------------------------------------------------------------------------*/ +DLOAD_HANDLE DLOAD_create(void * client_handle); + +/*---------------------------------------------------------------------------*/ +/* DLOAD_destroy() */ +/* */ +/* Destroy and finalize the dynamic loader core's handle. */ +/* */ +/*---------------------------------------------------------------------------*/ +void DLOAD_destroy(DLOAD_HANDLE handle); + +/*---------------------------------------------------------------------------*/ +/* DLOAD_initialize() */ +/* */ +/* Construct and initialize data structures internal to the dynamic */ +/* loader core. 
*/ +/* */ +/*---------------------------------------------------------------------------*/ +void DLOAD_initialize(DLOAD_HANDLE handle); + +/*---------------------------------------------------------------------------*/ +/* DLOAD_finalize() */ +/* */ +/* Destroy and finalize data structures internal to the dynamic */ +/* loader core. */ +/* */ +/*---------------------------------------------------------------------------*/ +void DLOAD_finalize(DLOAD_HANDLE handle); + +/*---------------------------------------------------------------------------*/ +/* DLOAD_load_symbols() */ +/* */ +/* Load externally visible symbols from the specified file so that they */ +/* can be linked against when another object file is subsequently loaded. */ +/* External symbols will be made available for global symbol linkage. */ +/* */ +/*---------------------------------------------------------------------------*/ +BOOL DLOAD_load_symbols(DLOAD_HANDLE handle, LOADER_FILE_DESC* fp); + +/*---------------------------------------------------------------------------*/ +/* DLOAD_load() */ +/* */ +/* Dynamically load the specified file and return a file handle for the */ +/* loaded file. If the load fails, this function will return a value */ +/* of zero (0). */ +/* */ +/* The core loader must have read access to the file pointed to by fp. */ +/* */ +/*---------------------------------------------------------------------------*/ +int DLOAD_load(DLOAD_HANDLE handle, LOADER_FILE_DESC* fp); + +/*---------------------------------------------------------------------------*/ +/* DLOAD_unload() */ +/* */ +/* Given a file handle ID, unload all object segments associated with */ +/* the identified file and any of its dependents that are not still in */ +/* use. 
*/ +/* */ +/*---------------------------------------------------------------------------*/ +BOOL DLOAD_unload(DLOAD_HANDLE handle, uint32_t pseudopid); + +/*---------------------------------------------------------------------------*/ +/* DLOAD_get_entry_names_info() */ +/* */ +/* Given a file handle, return the number of entry points that are */ +/* available in the specified file as well as the max name length. This */ +/* can then be used by the client to allocate the appropriate amount of */ +/* memory needed to call DLOAD_get_entry_names(). */ +/* */ +/*---------------------------------------------------------------------------*/ +BOOL DLOAD_get_entry_names_info(DLOAD_HANDLE handle, uint32_t file_handle, + int32_t *entry_pt_cnt, + int32_t *entry_pt_max_name_len); + +/*---------------------------------------------------------------------------*/ +/* DLOAD_get_entry_names() */ +/* */ +/* Given a file handle, build a list of entry point names that are */ +/* available in the specified file. This can be used when querying */ +/* the list of global functions available in a shared library. */ +/* */ +/*---------------------------------------------------------------------------*/ +BOOL DLOAD_get_entry_names(DLOAD_HANDLE handle, uint32_t file_handle, + int32_t* entry_pt_cnt, char*** entry_pt_names); + +/*---------------------------------------------------------------------------*/ +/* DLOAD_query_symbol() */ +/* */ +/* Query the value of a symbol that is defined by an object file that */ +/* has previously been loaded. Boolean return value will be false if */ +/* the symbol is not found. 
*/ +/* */ +/*---------------------------------------------------------------------------*/ +BOOL DLOAD_query_symbol(DLOAD_HANDLE handle, uint32_t file_handle, + const char *sym_name, TARGET_ADDRESS *sym_val); + +/*---------------------------------------------------------------------------*/ +/* DLOAD_get_entry_point() */ +/* */ +/* Given a file handle, return the entry point target address associated */ +/* with that object file. The entry point address value is written to */ +/* *sym_val. The return value of the function indicates whether the */ +/* file with the specified handle was found or not. */ +/* */ +/*---------------------------------------------------------------------------*/ +BOOL DLOAD_get_entry_point(DLOAD_HANDLE handle, uint32_t file_handle, + TARGET_ADDRESS *sym_val); + +/*---------------------------------------------------------------------------*/ +/* DLOAD_load_arguments() */ +/* */ +/* Given a file handle, find the object file associated with that handle */ +/* and copy the argc/argv information from the client into that object */ +/* file's .args section. The return value indicates whether the operation */ +/* was successful. If there are no loaded object files which match the */ +/* handle or if there is insufficient space in the .args section to hold */ +/* the specified argc/argv information, the function will return false. */ +/* */ +/*---------------------------------------------------------------------------*/ +BOOL DLOAD_load_arguments(DLOAD_HANDLE handle, uint32_t file_handle, + int argc, char** argv); + +/*---------------------------------------------------------------------------*/ +/* DLOAD_prepare_for_execution() */ +/* */ +/* Given a file handle, prepare for execution : */ +/* - Return entry point associated with that module in the *sym_val */ +/* output parameter. */ +/* - Write out the given arguments to the .args section contained in the */ +/* same module. 
*/ +/* - As a test (for the Reference implementation) read the arguments */ +/* using the DLIF_read_arguments() function and set global argc,argv. */ +/* */ +/* The return value of the function indicates whether the file with the */ +/* specified handle was found or not. */ +/* */ +/*---------------------------------------------------------------------------*/ +BOOL DLOAD_prepare_for_execution(DLOAD_HANDLE handle, uint32_t file_handle, + TARGET_ADDRESS *sym_val, + int argc, char** argv); + + +/*****************************************************************************/ +/* Client Provided API Functions */ +/*****************************************************************************/ + +/*---------------------------------------------------------------------------*/ +/* File I/O */ +/* */ +/* The client side of the dynamic loader must provide basic file I/O */ +/* capabilities so that the core loader has random access into any */ +/* object file that it is asked to load. */ +/* */ +/* The client side of the dynamic loader must provide a definition of */ +/* the LOADER_FILE_DESC in dload_filedefs.h. This allows the core loader */ +/* to be independent of how the client accesses raw data in an object */ +/* file. */ +/* */ +/*---------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------*/ +/* DLIF_fseek() */ +/* */ +/* Seek to a position in a file (accessed via 'stream') based on the */ +/* values for offset and origin. */ +/* */ +/*---------------------------------------------------------------------------*/ +int DLIF_fseek(LOADER_FILE_DESC *stream, int32_t offset, int origin); + +/*---------------------------------------------------------------------------*/ +/* DLIF_ftell() */ +/* */ +/* Return the current file position in the file identified in the */ +/* LOADER_FILE_DESC pointed to by 'stream'. 
*/ +/* */ +/*---------------------------------------------------------------------------*/ +int32_t DLIF_ftell(LOADER_FILE_DESC *stream); + +/*---------------------------------------------------------------------------*/ +/* DLIF_fread() */ +/* */ +/* Read 'size' * 'nmemb' bytes of data from the file identified in the */ +/* LOADER_FILE_DESC object pointed to by 'stream', and write that data */ +/* into the memory accessed via 'ptr'. */ +/* */ +/*---------------------------------------------------------------------------*/ +size_t DLIF_fread(void *ptr, size_t size, size_t nmemb, + LOADER_FILE_DESC *stream); + +/*---------------------------------------------------------------------------*/ +/* DLIF_fclose() */ +/* */ +/* Close a file that was opened on behalf of the core loader. Ownership */ +/* of the file pointer in question belongs to the core loader, but the */ +/* client has exclusive access to the file system. */ +/* */ +/*---------------------------------------------------------------------------*/ +int DLIF_fclose(LOADER_FILE_DESC *fd); + +/*---------------------------------------------------------------------------*/ +/* Host Memory Management */ +/* */ +/* Allocate and free host memory as needed for the dynamic loader's */ +/* internal data structures. If the dynamic loader resides on the */ +/* target architecture, then this memory is allocated from a target */ +/* memory heap that must be managed separately from memory that is */ +/* allocated for a dynamically loaded object file. */ +/* */ +/*---------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------*/ +/* DLIF_malloc() */ +/* */ +/* Allocate 'size' bytes of memory space that is usable as scratch space */ +/* (appropriate for the loader's internal data structures) by the dynamic */ +/* loader. */ +/* */ +/* If allocation fails, this function must not return. 
*/ +/* */ +/*---------------------------------------------------------------------------*/ +void* DLIF_malloc(size_t size); + +/*---------------------------------------------------------------------------*/ +/* DLIF_free() */ +/* */ +/* Free memory space that was previously allocated by DLIF_malloc(). */ +/* */ +/*---------------------------------------------------------------------------*/ +void DLIF_free(void* ptr); + +/*---------------------------------------------------------------------------*/ +/* Target Memory Allocator Interface */ +/* */ +/* The client side of the dynamic loader must create and maintain an */ +/* infrastructure to manage target memory. The client must keep track */ +/* of what target memory is associated with each object segment, */ +/* allocating target memory for newly loaded objects and release target */ +/* memory that is associated with objects that are being unloaded from */ +/* the target architecture. */ +/* */ +/* The two client-supplied functions, DLIF_allocate() and DLIF_release(), */ +/* are used by the core loader to interface into the client side's */ +/* target memory allocator infrastructure. */ +/* */ +/*---------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------*/ +/* DLOAD_SEGMENT_FLAGS - segment characteristics. */ +/*---------------------------------------------------------------------------*/ +typedef uint32_t DLOAD_SEGMENT_FLAGS; +static const int DLOAD_SF_executable = 0x1; /* Memory must be executable */ +static const int DLOAD_SF_relocatable = 0x2; /* Segment must be relocatable */ +static const int DLOAD_SF_writable = 0x4; /* Memory must be writable */ + +/*---------------------------------------------------------------------------*/ +/* DLOAD_MEMORY_SEGMENT - Define structure to represent placement and size */ +/* details of a segment to be loaded. 
*/ +/*---------------------------------------------------------------------------*/ +struct DLOAD_MEMORY_SEGMENT +{ + uint32_t target_page; /* requested/returned memory page */ + TARGET_ADDRESS target_address; /* requested/returned address */ + uint32_t objsz_in_bytes; /* size of init'd part of segment */ + uint32_t memsz_in_bytes; /* size of memory block for segment */ +// DLOAD_SEGMENT_FLAGS flags; /* allocation request flags */ +}; + +/*---------------------------------------------------------------------------*/ +/* DLOAD_MEMORY_REQUEST - Define structure to represent a target memory */ +/* request made by the core loader on behalf of a segment that the */ +/* loader needs to relocate and write into target memory. */ +/*---------------------------------------------------------------------------*/ +struct DLOAD_MEMORY_REQUEST +{ + LOADER_FILE_DESC *fp; /* file being loaded */ + struct DLOAD_MEMORY_SEGMENT *segment; /* obj for req/ret alloc */ + void *host_address; /* ret hst ptr from DLIF_copy()*/ + BOOL is_loaded; /* returned as true if segment */ + /* is already in target memory */ + uint32_t offset; /* file offset of segment's */ + /* raw data */ + uint32_t flip_endian; /* endianness of trg opp host */ + DLOAD_SEGMENT_FLAGS flags; /* allocation request flags */ + uint32_t align; /* align of trg memory block */ +}; + +/*---------------------------------------------------------------------------*/ +/* DLIF_initMem() */ +/* */ +/* Given an address and size, initialize the memory used to load the */ +/* dynamic segments. This should be called by the client before */ +/* beginning dynamic loading. */ +/* */ +/*---------------------------------------------------------------------------*/ +BOOL DLIF_initMem(void* client_handle, uint32_t dynMemAddr, uint32_t size); + +/*---------------------------------------------------------------------------*/ +/* DLIF_deinitMem() */ +/* */ +/* De-initialize the memory used to load the dynamic segments. 
*/ +/* */ +/*---------------------------------------------------------------------------*/ +BOOL DLIF_deinitMem(void* client_handle); + +/*---------------------------------------------------------------------------*/ +/* DLIF_allocate() */ +/* */ +/* Given a DLOAD_MEMORY_REQUEST created by the core loader, allocate */ +/* target memory to fulfill the request using the target memory */ +/* management infrastructure on the client side of the dynamic loader. */ +/* The contents of the DLOAD_MEMORY_REQUEST will be updated per the */ +/* details of a successful allocation. The allocated page and address */ +/* can be found in the DLOAD_MEMORY_SEGMENT attached to the request. */ +/* The boolean return value reflects whether the allocation was */ +/* successful or not. */ +/* */ +/*---------------------------------------------------------------------------*/ +BOOL DLIF_allocate(void* client_handle, struct DLOAD_MEMORY_REQUEST *req); + +/*---------------------------------------------------------------------------*/ +/* DLIF_release() */ +/* */ +/* Given a DLOAD_MEMORY_SEGMENT description, free the target memory */ +/* associated with the segment using the target memory management */ +/* infrastructure on the client side of the dynamic loader. */ +/* */ +/*---------------------------------------------------------------------------*/ +BOOL DLIF_release(void* client_handle, struct DLOAD_MEMORY_SEGMENT* ptr); + +/*---------------------------------------------------------------------------*/ +/* Target Memory Access / Write Services */ +/* */ +/* The client side's target memory allocator infrastructure communicates */ +/* with the core loader through the DLOAD_MEMORY_REQUEST and */ +/* DLOAD_MEMORY_SEGMENT data structures defined above. To complete the */ +/* loading of an object segment, the segment may need to be relocated */ +/* before it is actually written to target memory in the space that was */ +/* allocated for it by DLIF_allocate().
*/ +/* */ +/* The client side of the dynamic loader provides two functions to help */ +/* complete the process of loading an object segment, DLIF_copy() and */ +/* DLIF_write(). */ +/* */ +/* These functions help to make the core loader truly independent of */ +/* whether it is running on the host or target architecture and how the */ +/* client provides for reading/writing from/to target memory. */ +/* */ +/*---------------------------------------------------------------------------*/ +/*---------------------------------------------------------------------------*/ +/* DLIF_copy() */ +/* */ +/* Copy segment data from the object file described in the 'fp' and */ +/* 'offset' of the DLOAD_MEMORY_REQUEST into host accessible memory so */ +/* that it can be relocated or otherwise manipulated by the core loader. */ +/* */ +/*---------------------------------------------------------------------------*/ +BOOL DLIF_copy(void* client_handle, struct DLOAD_MEMORY_REQUEST* req); + +/*---------------------------------------------------------------------------*/ +/* DLIF_write() */ +/* */ +/* Once the segment data described in the DLOAD_MEMORY_REQUEST is ready */ +/* (relocated, if needed), write the segment contents to the target */ +/* memory identified in the DLOAD_MEMORY_SEGMENT attached to the request. */ +/* */ +/* After the segment contents have been written to target memory, the */ +/* core loader should discard the DLOAD_MEMORY_REQUEST object, but retain */ +/* the DLOAD_MEMORY_SEGMENT object so that the target memory associated */ +/* with the segment can be released when the segment is unloaded. */ +/* */ +/*---------------------------------------------------------------------------*/ +BOOL DLIF_write(void* client_handle, struct DLOAD_MEMORY_REQUEST* req); + +/*---------------------------------------------------------------------------*/ +/* DLIF_read() */ +/* */ +/* Given a host accessible buffer, read content of indicated target */ +/* memory address into the buffer.
*/ +/*---------------------------------------------------------------------------*/ +BOOL DLIF_read(void* client_handle, + void *ptr, size_t size, size_t nmemb, TARGET_ADDRESS src); + +/*---------------------------------------------------------------------------*/ +/* DLIF_memcpy() */ +/* */ +/* Given a host accessible buffer, copy content from specified buffer */ +/* into target memory. */ +/*---------------------------------------------------------------------------*/ +BOOL DLIF_memcpy(void* client_handle, void *to, void *from, size_t size); + +/*---------------------------------------------------------------------------*/ +/* DLIF_execute() */ +/* */ +/* Start execution on the target architecture from given 'exec_addr'. */ +/* If the dynamic loader is running on the target architecture, this can */ +/* be effected as a simple function call. */ +/* */ +/*---------------------------------------------------------------------------*/ +int32_t DLIF_execute(void* client_handle, TARGET_ADDRESS exec_addr); + +/*---------------------------------------------------------------------------*/ +/* Loading and Unloading of Dependent Files */ +/* */ +/* The dynamic loader core loader must coordinate loading and unloading */ +/* dependent object files with the client side of the dynamic loader. */ +/* This allows the client to keep its bookkeeping information up to date */ +/* with what is currently loaded on the target architecture. */ +/* */ +/* For instance, the client may need to interact with a file system or */ +/* registry. The client may also need to update debug information in */ +/* synch with the loading and unloading of shared objects. 
*/ +/* */ +/*---------------------------------------------------------------------------*/ +/*---------------------------------------------------------------------------*/ +/* DLIF_load_dependent() */ +/* */ +/* Ask client to find and open a dependent file identified by the */ +/* 'so_name' parameter, then, if necessary, initiate a DLOAD_load() */ +/* call to actually load the shared object onto the target. A */ +/* successful load will return a file handle ID that the client can */ +/* associate with the newly loaded file. */ +/* */ +/*---------------------------------------------------------------------------*/ +int DLIF_load_dependent(void* client_handle, const char* so_name); + +/*---------------------------------------------------------------------------*/ +/* DLIF_unload_dependent() */ +/* */ +/* Ask client to unload a dependent file identified by the 'file_handle' */ +/* parameter. Initiate a call to DLOAD_unload() to actually free up */ +/* the target memory that was occupied by the object file. */ +/* */ +/*---------------------------------------------------------------------------*/ +void DLIF_unload_dependent(void* client_handle, uint32_t file_handle); + +/*---------------------------------------------------------------------------*/ +/* Error/Warning Registration Functions */ +/* */ +/* The client will maintain an error/warning log. This will allow the */ +/* core loader to register errors and warnings in the load during a */ +/* given dynamic load. The client is required to check the log after */ +/* each load attempt to report any problems. 
*/ +/* */ +/*---------------------------------------------------------------------------*/ + + +/*---------------------------------------------------------------------------*/ +/* Loader Warning Types */ +/*---------------------------------------------------------------------------*/ +typedef enum { + DLWT_MISC = 0, /* Miscellaneous warning */ + DLWT_FILE /* Warning missing/invalid file information */ +} LOADER_WARNING_TYPE; + +/*---------------------------------------------------------------------------*/ +/* DLIF_warning() */ +/* */ +/* Log a warning message with the client's error/warning handling */ +/* infrastructure. */ +/* */ +/*---------------------------------------------------------------------------*/ +void DLIF_warning(LOADER_WARNING_TYPE wtype, const char *fmt, ...); + +/*---------------------------------------------------------------------------*/ +/* Loader Error Types */ +/*---------------------------------------------------------------------------*/ +typedef enum { + DLET_MISC = 0, /* Miscellaneous error */ + DLET_FILE, /* Error reading/processing file */ + DLET_SYMBOL, /* Symbol resolution error */ + DLET_RELOC, /* Relocation error */ + DLET_MEMORY, /* Host memory allocation/free error */ + DLET_TRGMEM, /* Target memory allocation/free error */ + DLET_DEBUG /* Shared object or DLL debug error */ +} LOADER_ERROR_TYPE; + +/*---------------------------------------------------------------------------*/ +/* DLIF_error() */ +/* */ +/* Log an error message with the client's error/warning handling */ +/* infrastructure. */ +/* */ +/*---------------------------------------------------------------------------*/ +void DLIF_error(LOADER_ERROR_TYPE etype, const char *fmt, ...); + +/*---------------------------------------------------------------------------*/ +/* DLIF_exit() */ +/* */ +/* Abort the loader following a fatal error. 
*/ +/* */ +/*---------------------------------------------------------------------------*/ +void DLIF_exit(int code); + +/*---------------------------------------------------------------------------*/ +/* DLIF_trace() */ +/* */ +/* Log a message with the client's trace handling infrastructure. */ +/* */ +/*---------------------------------------------------------------------------*/ +void DLIF_trace(const char *fmt, ...); + +/*---------------------------------------------------------------------------*/ +/* Dynamic Static Base Table (DSBT) Support Functions */ +/*---------------------------------------------------------------------------*/ +#define DSBT_INDEX_INVALID -1 +#define DSBT_DSBT_BASE_INVALID 0 +#define DSBT_STATIC_BASE_INVALID 0 + +/*****************************************************************************/ +/* Core Loader Side of DSBT Support */ +/*****************************************************************************/ + +/*---------------------------------------------------------------------------*/ +/* DLOAD_get_dsbt_size() */ +/* */ +/* Query the size of the DSBT associated with a specified file. The */ +/* client will check the size of a module's DSBT before it writes a copy */ +/* of the master DSBT to the module's DSBT. If the module's DSBT is not */ +/* big enough, an error will be emitted and the load will fail. */ +/* */ +/*---------------------------------------------------------------------------*/ +uint32_t DLOAD_get_dsbt_size(DLOAD_HANDLE handle, int32_t file_handle); + +/*---------------------------------------------------------------------------*/ +/* DLOAD_get_dsbt_base() */ +/* */ +/* Find DSBT address for specified file. The client will query for this */ +/* address after allocation and symbol relocation has been completed. */ +/* The client will write a copy of the master DSBT to the returned DSBT */ +/* address if the module's DSBT size is big enough. 
*/ +/* */ +/*---------------------------------------------------------------------------*/ +BOOL DLOAD_get_dsbt_base(DLOAD_HANDLE handle, int32_t file_handle, + TARGET_ADDRESS *dsbt_base); + +/*---------------------------------------------------------------------------*/ +/* DLOAD_get_static_base() */ +/* */ +/* Find static base for a specified file. The client will query for this */ +/* address after allocation and symbol relocation has been completed. */ +/* The client will use the returned static base value to fill the slot */ +/* in the master DSBT that is associated with this module. */ +/* */ +/*---------------------------------------------------------------------------*/ +BOOL DLOAD_get_static_base(DLOAD_HANDLE handle, int32_t file_handle, + TARGET_ADDRESS *static_base); + + +/*****************************************************************************/ +/* Client Side of DSBT Support */ +/*****************************************************************************/ + +/*---------------------------------------------------------------------------*/ +/* DLIF_register_dsbt_index_request() */ +/* */ +/* Register a request for a DSBT index with the client. A module can */ +/* make a specific DSBT index request or it can allow the client to */ +/* assign a DSBT index on its behalf (requested_dsbt_index == -1). The */ +/* client implementation of this function must check that a specific DSBT */ +/* index request does not conflict with a previous specific DSBT index */ +/* request. */ +/* */ +/*---------------------------------------------------------------------------*/ +BOOL DLIF_register_dsbt_index_request(DLOAD_HANDLE handle, + const char *requestor_name, + int32_t requestor_file_handle, + int32_t requested_dsbt_index); + +/*---------------------------------------------------------------------------*/ +/* DLIF_assign_dsbt_indices() */ +/* */ +/* Bind each module that registered a request for a DSBT index to a */ +/* specific slot in the DSBT. 
Specific requests for DSBT indices will be */ +/* honored first. Any general requests that remain will be assigned to */ +/* the first available slot in the DSBT. */ +/* */ +/*---------------------------------------------------------------------------*/ +void DLIF_assign_dsbt_indices(void); + +/*---------------------------------------------------------------------------*/ +/* DLIF_get_dsbt_index() */ +/* */ +/* Given a module that uses the DSBT model, return the identity of the */ +/* DSBT slot that was assigned to it by the client. This function can */ +/* only be called after the client has assigned DSBT indices to all */ +/* loaded object modules that use the DSBT model. The implementation of */ +/* this function will check that a proper DSBT index has been assigned to */ +/* the specified module and an invalid index (-1) if there is a problem. */ +/* */ +/*---------------------------------------------------------------------------*/ +int32_t DLIF_get_dsbt_index(int32_t file_handle); + +/*---------------------------------------------------------------------------*/ +/* DLIF_update_all_dsbts() */ +/* */ +/* Populate the client's model of the master DSBT with the static base */ +/* for each assigned slot in the DSBT, then write a copy of the master */ +/* DSBT to each module's DSBT location. The implementation of this */ +/* function must check the size of each module's DSBT to make sure that */ +/* it is large enough to hold a copy of the master DSBT. The function */ +/* will return FALSE if there is a problem. */ +/* */ +/*---------------------------------------------------------------------------*/ +BOOL DLIF_update_all_dsbts(void); + +#endif diff --git a/src/core/dsp/ocl_load/DLOAD_SYM/symtab.c b/src/core/dsp/ocl_load/DLOAD_SYM/symtab.c new file mode 100644 index 0000000..fbcdbeb --- /dev/null +++ b/src/core/dsp/ocl_load/DLOAD_SYM/symtab.c @@ -0,0 +1,417 @@ +/* +* symtab.c +* +* Symbol table creation, maintenance, and management. 
This module also +* contains implementations of local and global symbol table lookup +* algorithms, as appropriate for the platform that we are running on +* (assumed to be DSP Bridge or Linux model, indicated by +* direct_dependent_only flag in a given Module). +* +* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/ +* +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* +* Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the +* distribution. +* +* Neither the name of Texas Instruments Incorporated nor the names of +* its contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*
+*/
+
+#include "elf32.h"
+#include "ArrayList.h"
+
+/*---------------------------------------------------------------------------*/
+/* Set up a Queue of Int32 type data objects.                                */
+/*---------------------------------------------------------------------------*/
+#include "Queue.h"
+TYPE_QUEUE_DEFINITION(int32_t, Int32)
+TYPE_QUEUE_IMPLEMENTATION(int32_t, Int32)
+
+#include "symtab.h"
+#include "dload_api.h"
+#include <string.h>
+
+/*---------------------------------------------------------------------------*/
+/* Holds the handle of the ET_EXEC-type module loaded, if any.               */
+/* DLSYM_global_lookup() passes it to breadth_first_lookup() as the file     */
+/* handle at which the breadth-first search begins.                          */
+/*---------------------------------------------------------------------------*/
+int32_t DLIMP_application_handle = 0;
+
+/*---------------------------------------------------------------------------*/
+/* Function prototypes                                                       */
+/* DLSYM_lookup_global_symtab() is defined further down in this file but is  */
+/* called by the lookup routines that precede it.                            */
+/*---------------------------------------------------------------------------*/
+BOOL DLSYM_lookup_global_symtab(const char *sym_name, struct Elf32_Sym *symtab,
+                                Elf32_Word symnum, Elf32_Addr *sym_value);
+
+/*****************************************************************************/
+/* DLSYM_COPY_GLOBALS() - Copy global symbols from the dynamic module's      */
+/*      symbol table to the loader's global symbol table.                    */
+/*                                                                           */
+/*      dyn_module - supplies symtab, symnum, strtab, strsz and the offsets  */
+/*                   (gsymtab_offset, gstrtab_offset) of the global parts.   */
+/*                   The copies are stored in dyn_module->loaded_module      */
+/*                   (gsymtab, gsymnum, gstrtab, gstrsz); any tables already */
+/*                   held there are freed first.                             */
+/*****************************************************************************/
+void DLSYM_copy_globals(DLIMP_Dynamic_Module *dyn_module)
+{
+   Elf32_Word i, global_index, global_symnum;
+   DLIMP_Loaded_Module *module = dyn_module->loaded_module;
+
+#if LOADER_DEBUG
+   if (debugging_on)
+      DLIF_trace("DLSYM_copy_globals:\n");
+#endif
+
+   /*------------------------------------------------------------------------*/
+   /* The dynamic symbol table is sorted so that the local symbols come      */
+   /* before the global symbols. gsymtab_offset points to the address where  */
+   /* the first global symbol starts. Only the global symbols need to be     */
+   /* copied into the persistent info.                                       */
+   /* gsymtab_offset is divided by the entry size below to obtain the index  */
+   /* of the first global symbol.                                            */
+   /* NOTE(review): global_symnum comes from an unsigned subtraction;        */
+   /* presumably the caller guarantees that index is within symnum - confirm.*/
+   /*------------------------------------------------------------------------*/
+   global_index  = dyn_module->gsymtab_offset / sizeof(struct Elf32_Sym);
+   global_symnum = dyn_module->symnum - global_index;
+
+   /*------------------------------------------------------------------------*/
+   /* Create space for the new global symbol table.                          */
+   /* The DLIF_malloc() result is used unchecked; dload_api.h documents that */
+   /* DLIF_malloc() must not return if allocation fails.                     */
+   /*------------------------------------------------------------------------*/
+   if (module->gsymtab)
+   {
+      DLIF_free(module->gsymtab);
+      module->gsymtab = NULL;
+   }
+
+   if (global_symnum > 0)
+   {
+      module->gsymtab = DLIF_malloc(sizeof(struct Elf32_Sym) * global_symnum);
+
+      memcpy(module->gsymtab,
+             &dyn_module->symtab[global_index],
+             sizeof(struct Elf32_Sym) * global_symnum);
+   }
+   module->gsymnum = global_symnum;
+
+   /*------------------------------------------------------------------------*/
+   /* Copy the string table part that contains the global symbol names.      */
+   /* Everything from gstrtab_offset to the end of strtab is copied.         */
+   /*------------------------------------------------------------------------*/
+   if (module->gstrtab)
+   {
+      DLIF_free(module->gstrtab);
+      module->gstrtab = NULL;
+   }
+
+   module->gstrsz = dyn_module->strsz - dyn_module->gstrtab_offset;
+   if (module->gstrsz)
+   {
+      module->gstrtab = DLIF_malloc(module->gstrsz);
+
+      memcpy(module->gstrtab,
+             dyn_module->strtab + dyn_module->gstrtab_offset,
+             module->gstrsz);
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Update the symbol names of the global symbol entries to point to       */
+   /* the symbol names in the string table.                                  */
+   /* NOTE: Note that we don't set the offset into the string table. We      */
+   /* instead set the full address so that the st_name field can be accessed */
+   /* as char *.                                                             */
+   /* The source entry's st_name is likewise treated as an address inside    */
+   /* dyn_module->strtab: subtracting the strtab address gives its offset,   */
+   /* which is rebased by gstrtab_offset onto the copied string table.       */
+   /* NOTE(review): the address is stored in an Elf32 integer field, which   */
+   /* presumably relies on host pointers fitting in 32 bits - confirm.       */
+   /*------------------------------------------------------------------------*/
+   for (i = 0; i < global_symnum; i++)
+   {
+
+      Elf32_Word old_offset = dyn_module->symtab[i + global_index].st_name -
+                              (Elf32_Addr) dyn_module->strtab;
+      Elf32_Word new_offset = old_offset - dyn_module->gstrtab_offset;
+      struct Elf32_Sym *sym = &((struct Elf32_Sym*)(module->gsymtab))[i];
+      sym->st_name = new_offset + (Elf32_Addr)module->gstrtab;
+
+#if LOADER_DEBUG
+      if (debugging_on) DLIF_trace("Copying symbol: %s\n",
+                              (char*)dyn_module->symtab[i + global_index].st_name);
+#endif
+   }
+}
+
+/*****************************************************************************/
+/* BREADTH_FIRST_LOOKUP() - Perform a breadth-first search of the Module     */
+/*      dependency graph to find specified symbol name (sym_name).           */
+/*                                                                           */
+/*      phandle   - loader instance; cast to LOADER_OBJECT to reach its      */
+/*                  list of loaded objects.                                  */
+/*      handle    - file handle of the Module at which the search starts.    */
+/*      sym_value - passed through to DLSYM_lookup_global_symtab().          */
+/*****************************************************************************/
+static BOOL breadth_first_lookup(DLOAD_HANDLE phandle,
+                                 const char* sym_name,
+                                 int handle,
+                                 Elf32_Addr *sym_value)
+{
+   /*------------------------------------------------------------------------*/
+   /* We start this function by putting the specified file handle on the     */
+   /* file_handle_queue.                                                     */
+   /*------------------------------------------------------------------------*/
+   LOADER_OBJECT *dHandle = (LOADER_OBJECT *)phandle;
+   Int32_Queue file_handle_queue = TYPE_QUEUE_INITIALIZER;
+   Int32_enqueue(&file_handle_queue, handle);
+
+   /*------------------------------------------------------------------------*/
+   /* While the queue is not empty, keep looking for the symbol.             */
+   /*------------------------------------------------------------------------*/
+   while(file_handle_queue.size)
+   {
+      int i;
+
+      /*---------------------------------------------------------------------*/
+      /* Set up a pointer to front of the list of loaded files so that we    */
+      /* can be sure that dependent files will be searched in load order.
*/ + /*---------------------------------------------------------------------*/ + loaded_module_ptr_Queue_Node* mod_node = + dHandle->DLIMP_loaded_objects.front_ptr; + int* dependencies = (int*)(mod_node->value->dependencies.buf); + + /*---------------------------------------------------------------------*/ + /* Pluck off the file handle at the front of the file_handle_queue. */ + /* We will search this file next. */ + /*---------------------------------------------------------------------*/ + handle = Int32_dequeue(&file_handle_queue); + + /*---------------------------------------------------------------------*/ + /* Locate the Module associated with the current file handle. */ + /*---------------------------------------------------------------------*/ + while (mod_node->value->file_handle != handle) mod_node++; + + /*---------------------------------------------------------------------*/ + /* Search the symbol table of the current file handle's Module. */ + /* If the symbol was found, then we're finished. */ + /*---------------------------------------------------------------------*/ + if (DLSYM_lookup_global_symtab(sym_name, + mod_node->value->gsymtab, + mod_node->value->gsymnum, + sym_value)) + return TRUE; + + /*---------------------------------------------------------------------*/ + /* If our symbol was not in the current Module, then add this Module's */ + /* dependents to the end of the file_handle_queue. */ + /*---------------------------------------------------------------------*/ + for (i = 0; i < mod_node->value->dependencies.size; i++) + Int32_enqueue(&file_handle_queue, dependencies[i]); + } + + /*------------------------------------------------------------------------*/ + /* We didn't find our symbol; return FALSE. 
*/ + /*------------------------------------------------------------------------*/ + return FALSE; +} + +/*****************************************************************************/ +/* DLSYM_global_lookup() - Search the global symbol table to find the */ +/* definition of the given symbol name. */ +/*****************************************************************************/ +BOOL DLSYM_global_lookup(DLOAD_HANDLE handle, + const char *sym_name, + DLIMP_Loaded_Module *loaded_module, + Elf32_Addr *sym_value) +{ + int i = 0; + loaded_module_ptr_Queue_Node* node; + LOADER_OBJECT *dHandle = (LOADER_OBJECT *)handle; + +#if LOADER_DEBUG + if (debugging_on) + DLIF_trace("DLSYM_global_lookup: %s\n", sym_name); +#endif + + /*------------------------------------------------------------------------*/ + /* We will choose a different lookup algorithm based on what kind of */ + /* platform we are supporting. In the Braveheart case, the global symbol */ + /* lookup algorithm searches the base image first, followed by the */ + /* explicit children of the specified Module. */ + /*------------------------------------------------------------------------*/ + if (loaded_module->direct_dependent_only) + { + int* child_handle = (int*)(loaded_module->dependencies.buf); + + /*---------------------------------------------------------------------*/ + /* Spin through list of this Module's dependencies (anything on its */ + /* DT_NEEDED list), searching through each dependent's symbol table */ + /* to find the symbol we are after. */ + /*---------------------------------------------------------------------*/ + for (i = 0; i < loaded_module->dependencies.size; i++) + { + for (node = dHandle->DLIMP_loaded_objects.front_ptr; + node->value->file_handle != child_handle[i]; + node=node->next_ptr); + + /*------------------------------------------------------------------*/ + /* Return true if we find the symbol. 
*/ + /*------------------------------------------------------------------*/ + if (DLSYM_lookup_global_symtab(sym_name, + node->value->gsymtab, + node->value->gsymnum, + sym_value)) + return TRUE; + } + } + + /*------------------------------------------------------------------------*/ + /* In the LINUX model, we will use a breadth-first global symbol lookup */ + /* algorithm. First, the application's global symbol table is searched, */ + /* followed by its children, followed by their children, and so on. */ + /* It is up to the client of this module to set the application handle. */ + /*------------------------------------------------------------------------*/ + else + { + if (breadth_first_lookup(handle, sym_name, DLIMP_application_handle, + sym_value)) + return TRUE; + } + + /*------------------------------------------------------------------------*/ + /* If we got this far, then symbol was not found. */ + /*------------------------------------------------------------------------*/ + DLIF_error(DLET_SYMBOL, "Could not resolve symbol %s!\n", sym_name); + + return FALSE; +} + +/*****************************************************************************/ +/* DLSYM_lookup_symtab() - Lookup the symbol name in the given symbol table. */ +/* Symbol must have specified binding. Return the */ +/* value in sym_value and return TRUE if the lookup */ +/* succeeds. 
*/ +/*****************************************************************************/ +static BOOL DLSYM_lookup_symtab(const char *sym_name, struct Elf32_Sym *symtab, + Elf32_Word symnum, Elf32_Addr *sym_value, + BOOL require_local_binding) +{ + Elf32_Addr sym_idx; + +#if LOADER_DEBUG + if (debugging_on) + DLIF_trace("DLSYM_lookup_symtab, sym to find : %s\n", sym_name); +#endif + + for (sym_idx = 0; sym_idx < symnum; sym_idx++) + { +#if LOADER_DEBUG + if (debugging_on) + DLIF_trace("\tPotential symbol match : %s\n", + (char*)symtab[sym_idx].st_name); +#endif + + if ((symtab[sym_idx].st_shndx != SHN_UNDEF) && ((require_local_binding && + (ELF32_ST_BIND(symtab[sym_idx].st_info) == STB_LOCAL)) || + (!require_local_binding && + (ELF32_ST_BIND(symtab[sym_idx].st_info) != STB_LOCAL))) && + !strcmp(sym_name,(char*)(symtab[sym_idx].st_name))) + { + if (sym_value) *sym_value = symtab[sym_idx].st_value; + return TRUE; + } + } + if (sym_value) *sym_value = 0; + return FALSE; +} + +/*****************************************************************************/ +/* DLSYM_lookup_global_symtab() - Lookup the symbol name in the given symbol */ +/* table. Symbol must have global binding. */ +/* Return the value in sym_value and return */ +/* TRUE if the lookup succeeds. */ +/*****************************************************************************/ +BOOL DLSYM_lookup_global_symtab(const char *sym_name, struct Elf32_Sym *symtab, + Elf32_Word symnum, Elf32_Addr *sym_value) +{ + return DLSYM_lookup_symtab(sym_name, symtab, symnum, sym_value, FALSE); +} + +/*****************************************************************************/ +/* DLSYM_lookup_local_symtab() - Lookup the symbol name in the given symbol */ +/* table. Symbol must have local binding. */ +/* Return the value in sym_value and return */ +/* TRUE if the lookup succeeds. 
*/ +/*****************************************************************************/ +BOOL DLSYM_lookup_local_symtab(const char *sym_name, struct Elf32_Sym *symtab, + Elf32_Word symnum, Elf32_Addr *sym_value) +{ + return DLSYM_lookup_symtab(sym_name, symtab, symnum, sym_value, TRUE); +} + +/*****************************************************************************/ +/* CANONICAL_SYMBOL_LOOKUP() - Find the symbol definition. Look up the local */ +/* symbol table to find the symbol. If it is a */ +/* definition and cannot be pre-empted, return */ +/* it. Otherwise, do a look up in the global */ +/* symbol table that contains the symbol tables */ +/* from all the necessary modules. */ +/*****************************************************************************/ +BOOL DLSYM_canonical_lookup(DLOAD_HANDLE handle, int sym_index, + DLIMP_Dynamic_Module *dyn_module, + Elf32_Addr *sym_value) +{ + /*------------------------------------------------------------------------*/ + /* Lookup the symbol table to get the symbol characteristics. */ + /*------------------------------------------------------------------------*/ + struct Elf32_Sym *sym = &dyn_module->symtab[sym_index]; + int32_t st_bind = ELF32_ST_BIND(sym->st_info); + int32_t st_vis = ELF32_ST_VISIBILITY(sym->st_other); + BOOL is_def = (sym->st_shndx != SHN_UNDEF && + (sym->st_shndx < SHN_LORESERVE || + sym->st_shndx == SHN_ABS || + sym->st_shndx == SHN_COMMON || + sym->st_shndx == SHN_XINDEX)); + const char *sym_name = (char *)sym->st_name; + +#if LOADER_DEBUG + if (debugging_on) + DLIF_trace("DLSYM_canonical_lookup: %d, %s\n", sym_index, sym_name); +#endif + + /*------------------------------------------------------------------------*/ + /* Local symbols and symbol definitions that cannot be pre-empted */ + /* are resolved by the definition in the same module. 
*/ + /*------------------------------------------------------------------------*/ + if (st_bind == STB_LOCAL || st_vis != STV_DEFAULT) + { + /*---------------------------------------------------------------------*/ + /* If it is a local symbol or non-local that cannot be preempted, */ + /* the definition should be found in the same module. If we don't */ + /* find the definition it is an error. */ + /*---------------------------------------------------------------------*/ + if (!is_def) + { + DLIF_error(DLET_SYMBOL, + "Local/non-imported symbol %s definition is not found " + "in module %s!\n", sym_name, dyn_module->name); + return FALSE; + } + else + { + if (sym_value) *sym_value = sym->st_value; + return TRUE; + } + } + /*------------------------------------------------------------------------*/ + /* Else we have either pre-emptable defintion or undef symbol. We need */ + /* to do global look up. */ + /*------------------------------------------------------------------------*/ + else + { + return DLSYM_global_lookup(handle, sym_name, dyn_module->loaded_module, + sym_value); + } +} + diff --git a/src/core/dsp/ocl_load/README b/src/core/dsp/ocl_load/README new file mode 100644 index 0000000..19165f6 --- /dev/null +++ b/src/core/dsp/ocl_load/README @@ -0,0 +1,8 @@ + +This program is dependent on these Standard CVS modules + +C60_DLOAD_DYN: +C60_DLOAD_REL: +DLOAD: +DLOAD_API: +DLOAD_SYM: diff --git a/src/core/dsp/ocl_load/Stack.h b/src/core/dsp/ocl_load/Stack.h new file mode 100644 index 0000000..e958674 --- /dev/null +++ b/src/core/dsp/ocl_load/Stack.h @@ -0,0 +1,182 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +/* +* Stack.h +* +* Interface to Stack +* ------------------ +* +* This is an implementation of a type-independent stack implemented as +* a singly linked list class for C. It's basically a template class, but +* uses macros instead, so that it can be compiled with a C-only compiler. 
+* +* To define a Stack class: +* #include "Stack.h" +* TYPE_STACK_DEFINITION(object_type,Class_Identifier) +* +* In a separate C file: +* #include "Stack.h" +* TYPE_STACK_DEFINITION(object_type,Class_Identifier) +* TYPE_STACK_IMPLEMENTATION(object_type,Class_Identifier) +* +* Now, to create a stack: +* Class_Identifier_Stack name; +* Get it initialized to zero everywhere somehow, maybe like this: +* Class_Identifier_initialize_stack(&name); +* +* To add to the stack: +* Class_Identifier_push(&name, object); +* +* To access the top of the stack: +* Class_Identifier_Stack_Node *tos = name.top_ptr; +* do_something_to_(tos->value); +* +* To delete from the stack: +* if (name.size > 0) Class_Identifier_pop(&name); +* +* Copyright (C) 2009 Texas Instruments Incorporated - http://www.ti.com/ +* +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* +* Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the +* distribution. +* +* Neither the name of Texas Instruments Incorporated nor the names of +* its contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ + +#ifndef STACK_H +#define STACK_H + +#include <inttypes.h> +#include "dload_api.h" + +/*****************************************************************************/ +/* TYPE_STACK_DEFINITION() - Define structure specifications for a last-in, */ +/* first-out linked list of t_name objects. */ +/*****************************************************************************/ +#define TYPE_STACK_DEFINITION(t, t_name) \ +struct t_name##_Stack_Node_ \ +{ \ + t value; \ + struct t_name##_Stack_Node_* next_ptr; \ +}; \ +typedef struct t_name##_Stack_Node_ t_name##_Stack_Node; \ + \ +typedef struct \ +{ \ + t_name##_Stack_Node* top_ptr; \ + t_name##_Stack_Node* bottom_ptr; \ + int size; \ +} t_name##_Stack; \ + \ +extern void t_name##_initialize_stack(t_name##_Stack* stack); \ +extern void t_name##_push(t_name##_Stack* stack, t to_push); \ +extern t t_name##_pop(t_name##_Stack* stack); + +/*****************************************************************************/ +/* TYPE_STACK_DEFINITION() - Define the initializer to initalize Stacks. */ +/*****************************************************************************/ +#define TYPE_STACK_INITIALIZER {NULL, NULL, 0 } + +/*****************************************************************************/ +/* TYPE_STACK_IMPLEMENTATION() - Define member functions of new LIFO linked */ +/* list "class" of t_name objects. 
*/ +/* */ +/* <type>_initialize_stack() - clears the stack */ +/* <type>_push() - pushes a <t> type object to the top of the stack */ +/* <type>_pop() - pop a <t> type object from the top of the stack */ +/* and provide access to it to the caller */ +/*****************************************************************************/ +#define TYPE_STACK_IMPLEMENTATION(t, t_name) \ +void t_name##_initialize_stack (t_name##_Stack* stack) \ +{ \ + stack->top_ptr = stack->bottom_ptr = NULL; \ + stack->size = 0; \ +} \ +void t_name##_push(t_name##_Stack* stack, t to_push) \ +{ \ + stack->size++; \ + \ + if(!stack->top_ptr) \ + { \ + stack->bottom_ptr = stack->top_ptr = \ + (t_name##_Stack_Node*)(DLIF_malloc(sizeof(t_name##_Stack_Node))); \ + stack->top_ptr->next_ptr = NULL; \ + } \ + else \ + { \ + t_name##_Stack_Node* next_ptr = stack->top_ptr; \ + stack->top_ptr = \ + (t_name##_Stack_Node*)(DLIF_malloc(sizeof(t_name##_Stack_Node))); \ + stack->top_ptr->next_ptr = next_ptr; \ + } \ + \ + stack->top_ptr->value = to_push; \ +} \ + \ +t t_name##_pop(t_name##_Stack* stack) \ +{ \ + t to_ret; \ + t_name##_Stack_Node* next_ptr = stack->top_ptr->next_ptr; \ + \ + stack->size--; \ + to_ret = stack->top_ptr->value; \ + DLIF_free((void*)(stack->top_ptr)); \ + \ + if(!stack->size) \ + stack->top_ptr = stack->bottom_ptr = NULL; \ + else \ + stack->top_ptr = next_ptr; \ + \ + return to_ret; \ +} + +#endif diff --git a/src/core/dsp/ocl_load/ocl_load.c b/src/core/dsp/ocl_load/ocl_load.c new file mode 100644 index 0000000..c53a137 --- /dev/null +++ b/src/core/dsp/ocl_load/ocl_load.c @@ -0,0 +1,139 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include <stdio.h> +#include <stdarg.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> + +#include "dload_api.h" + +#define TYPE_STACK_DEFINITION(t, t_name) +#define TYPE_STACK_IMPLEMENTATION(t, t_name) + +int debugging_on = FALSE; +int profiling_on = FALSE; + +int global_argc; +char **global_argv; + +int DLIF_fseek(LOADER_FILE_DESC *stream, int32_t offset, int origin) + { return fseek(stream, offset, origin); } + + +size_t DLIF_fread(void *ptr, size_t size, size_t nmemb, + LOADER_FILE_DESC *stream) + { return fread(ptr, size, nmemb, stream); } + +int32_t DLIF_ftell (LOADER_FILE_DESC *stream) { return ftell(stream); } +int32_t DLIF_fclose(LOADER_FILE_DESC *fd) { return fclose(fd); } +void* DLIF_malloc(size_t size) { return malloc(size); } +void DLIF_free (void* ptr) { free(ptr); } + +/*****************************************************************************/ +/* DLIF_COPY() - Copy data from file to host-accessible memory. */ +/* Returns a host pointer to the data in the host_address field of the */ +/* DLOAD_MEMORY_REQUEST object. */ +/*****************************************************************************/ +BOOL DLIF_copy(void* client_handle, struct DLOAD_MEMORY_REQUEST* targ_req) +{ + struct DLOAD_MEMORY_SEGMENT* obj_desc = targ_req->segment; + LOADER_FILE_DESC* f = targ_req->fp; + void *buf = calloc(obj_desc->memsz_in_bytes, 1); + + fseek(f, targ_req->offset, SEEK_SET); + + int result = 1; + if (obj_desc->objsz_in_bytes) + result = fread(buf, obj_desc->objsz_in_bytes, 1, f); + + assert(result == 1); + + targ_req->host_address = buf; + + return 1; +} + +BOOL DLIF_read(void* client_handle, + void *ptr, size_t size, size_t nmemb, TARGET_ADDRESS src) + { assert(0); } + +BOOL DLIF_memcpy(void* client_handle, + void *to, void *from, size_t size) + { return (!memcpy(to, from, size)) ? 
0 : 1; } + +int32_t DLIF_execute(void* client_handle, + TARGET_ADDRESS exec_addr) { assert(0); return 1; } + + + + +BOOL DLIF_register_dsbt_index_request(DLOAD_HANDLE handle, + const char *requestor_name, + int32_t requestor_file_handle, + int32_t requested_dsbt_index) + { assert(0); } + +void DLIF_assign_dsbt_indices(void) { assert(0); } + +int32_t DLIF_get_dsbt_index(int32_t file_handle) + { assert(0); return DSBT_INDEX_INVALID; } + +BOOL DLIF_update_all_dsbts() { assert(0); return TRUE; } + +void DLIF_warning(LOADER_WARNING_TYPE wtype, const char *fmt, ...) +{ + va_list ap; + va_start(ap,fmt); + printf("<< D L O A D >> WARNING: "); + vprintf(fmt,ap); + va_end(ap); +} + +void DLIF_error(LOADER_ERROR_TYPE etype, const char *fmt, ...) +{ + va_list ap; + va_start(ap,fmt); + printf("<< D L O A D >> ERROR: "); + vprintf(fmt,ap); + va_end(ap); +} + +void DLIF_trace(const char *fmt, ...) +{ + va_list ap; + va_start(ap,fmt); + vprintf(fmt,ap); + va_end(ap); +} + +void DLIF_exit(ecode) +{ + exit(ecode); +} + diff --git a/src/core/dsp/program.cpp b/src/core/dsp/program.cpp new file mode 100644 index 0000000..6495ec9 --- /dev/null +++ b/src/core/dsp/program.cpp @@ -0,0 +1,633 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "program.h" +#include "device.h" +#include "kernel.h" + +#include "../program.h" + +#include <llvm/PassManager.h> +#include <llvm/Analysis/Passes.h> +#include <llvm/Analysis/Verifier.h> +#include <llvm/Transforms/Scalar.h> +#include <llvm/Transforms/IPO.h> +#include <llvm/Transforms/Utils/UnifyFunctionExitNodes.h> +#include <llvm/Support/raw_ostream.h> +#include <llvm/Bitcode/ReaderWriter.h> +#include "wga.h" + +#include <llvm/LinkAllPasses.h> +#include <WorkitemHandlerChooser.h> +#include <BreakConstantGEPs.h> +#include <Flatten.h> +#include <PHIsToAllocas.h> +#include <IsolateRegions.h> +#include <VariableUniformityAnalysis.h> +#include <ImplicitLoopBarriers.h> +#include <LoopBarriers.h> +#include <BarrierTailReplication.h> +#include <CanonicalizeBarriers.h> +#include <WorkItemAliasAnalysis.h> +#include <WorkitemReplication.h> +#include <WorkitemLoops.h> +#include <AllocasToEntry.h> +#include <Workgroup.h> +#include <TargetAddressSpaces.h> + +#include <string> +#include <iostream> +#include <fstream> +#include <sstream> +#include <stdio.h> +#include <stdlib.h> +#include <vector> +#include <sys/types.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <sys/stat.h> +#include <unistd.h> + +#include <elf.h> + +#include "genfile_cache.h" + +genfile_cache * genfile_cache::pInstance = 0; + +timespec getTime() +{ + struct timespec tp; + if (clock_gettime(CLOCK_MONOTONIC, &tp) != 0) + clock_gettime(CLOCK_REALTIME, &tp); + return tp; +} + +double ts_to_double(const timespec &t) + { return ((double)t.tv_nsec) /1000000000.0 + (double)t.tv_sec; } + +double tsdiff (const timespec& start, const timespec& end) + { return ts_to_double(end) - ts_to_double(start); } + + +using namespace Coal; + +DSPProgram::DSPProgram(DSPDevice *device, Program *program) +: DeviceProgram(), p_device(device), p_program(program), p_program_handle(-1), p_loaded(false), p_keep_files(false), + 
p_cache_kernels(true) +{ + char *keep = getenv("TI_OCL_KEEP_FILES"); + if (keep) p_keep_files = true; + + char *cache = getenv("TI_OCL_CACHE_KERNELS_OFF"); + if (cache) p_cache_kernels = false; +} + +DSPProgram::~DSPProgram() +{ + p_device->unload(p_program_handle); + if (!p_keep_files && !p_cache_kernels) unlink(p_outfile); +} + +DSPProgram::segment_list *segments; + +bool DSPProgram::load() +{ + segments = &p_segments_written; + + p_program_handle = p_device->load(p_outfile); + if (!p_program_handle) return false; + + segments = NULL; + p_loaded = true; + + char *debug_kernel = getenv("TI_OCL_DEBUG_KERNEL"); + + /*------------------------------------------------------------------------- + * ensure that the newly populated areas are not stale in device caches + *------------------------------------------------------------------------*/ + Msg_t msg; + int segNum = p_segments_written.size(); + + assert(segNum <= MAX_FLUSH_BUF_SIZE/2); + + msg.command = CACHEINV; + msg.u.k.flush.numBuffers = segNum; + msg.u.k.flush.num_mpaxs = 0; + for (int i=0; i < segNum; ++i) + { + msg.u.k.flush.buffers[2*i] = p_segments_written[i].ptr; + msg.u.k.flush.buffers[2*i+1] = p_segments_written[i].size; + + uint32_t flags = p_segments_written[i].flags & + (DLOAD_SF_executable | DLOAD_SF_writable); + + const char *seg_desc; + switch (flags) + { + case 0: seg_desc = "Read Only"; break; + case DLOAD_SF_executable: seg_desc = "Executable"; break; + case DLOAD_SF_writable: seg_desc = "Writable"; break; + default: seg_desc = "Writable & Executable"; break; + } + + if (debug_kernel) + printf("%s segment loaded to 0x%08x with size 0x%x\n", + seg_desc, p_segments_written[i].ptr, p_segments_written[i].size); + } + + /*------------------------------------------------------------------------- + * Send the command and wait for the ready response. 
+ *------------------------------------------------------------------------*/ + p_device->mail_to(msg); + + /*------------------------------------------------------------------------- + * We do not wait here. The wait will be handled by the standard wait loop + * int the worker thread. + *------------------------------------------------------------------------*/ + return true; +} + +bool DSPProgram::is_loaded() const +{ + return p_loaded; +} + +bool DSPProgram::linkStdLib() const +{ + return false; +} + +const char* DSPProgram::outfile_name() const +{ + return p_outfile; +} + +DSPDevicePtr DSPProgram::data_page_ptr() +{ + DSPDevicePtr p; + + if (!is_loaded()) load(); + + DLOAD_get_static_base(p_device->dload_handle(), p_program_handle, &p); + return p; +} + +void DSPProgram::createOptimizationPasses(llvm::PassManager *manager, + bool optimize, bool hasBarrier) +{ + if (hasBarrier) + { + manager->add(new llvm::DominatorTree()); + manager->add(new pocl::WorkitemHandlerChooser()); + manager->add(new BreakConstantGEPs()); // from pocl + // add(new GenerateHeader()); // no need + manager->add(new pocl::Flatten()); + manager->add( llvm::createAlwaysInlinerPass()); + manager->add( llvm::createGlobalDCEPass()); + manager->add( llvm::createCFGSimplificationPass()); + manager->add( llvm::createLoopSimplifyPass()); + manager->add(new pocl::PHIsToAllocas()); + manager->add( llvm::createRegionInfoPass()); + manager->add(new pocl::IsolateRegions()); + manager->add(new pocl::VariableUniformityAnalysis()); // TODO + manager->add(new pocl::ImplicitLoopBarriers()); + manager->add(new pocl::LoopBarriers()); + manager->add(new pocl::BarrierTailReplication()); + manager->add(new pocl::CanonicalizeBarriers()); + manager->add(new pocl::IsolateRegions()); + manager->add(new pocl::WorkItemAliasAnalysis()); + // add(new pocl::WorkitemReplication()); // no need + manager->add(new pocl::WorkitemLoops()); + manager->add(new pocl::AllocasToEntry()); + // add(new pocl::Workgroup()); // no need + 
manager->add(new pocl::TargetAddressSpaces()); + } + + if (optimize) + { + /* + * Inspired by code from "The LLVM Compiler Infrastructure" + */ + manager->add(llvm::createDeadArgEliminationPass()); + manager->add(llvm::createInstructionCombiningPass()); + manager->add(llvm::createFunctionInliningPass()); + manager->add(llvm::createPruneEHPass()); // Remove dead EH info. + manager->add(llvm::createGlobalOptimizerPass()); + manager->add(llvm::createGlobalDCEPass()); // Remove dead functions. + manager->add(llvm::createArgumentPromotionPass()); + manager->add(llvm::createInstructionCombiningPass()); + manager->add(llvm::createJumpThreadingPass()); + + //ASW TODO maybe turn off re: pete. might gen bad xlator input + //manager->add(llvm::createScalarReplAggregatesPass()); + + manager->add(llvm::createFunctionAttrsPass()); // Add nocapture. + manager->add(llvm::createGlobalsModRefPass()); // IP alias analysis. + manager->add(llvm::createLICMPass()); // Hoist loop invariants. + manager->add(llvm::createGVNPass()); // Remove redundancies. + manager->add(llvm::createMemCpyOptPass()); // Remove dead memcpys. 
+ manager->add(llvm::createDeadStoreEliminationPass()); + manager->add(llvm::createInstructionCombiningPass()); + manager->add(llvm::createJumpThreadingPass()); + manager->add(llvm::createCFGSimplificationPass()); + } + + manager->add(llvm::createUnifyFunctionExitNodesPass()); + manager->add(llvm::createTIOpenclWorkGroupAggregationPass(hasBarrier)); + + /*------------------------------------------------------------------------- + * Borrow the pocl alloca hoister for the TI simplistic WGA pass as well + *------------------------------------------------------------------------*/ + if (!hasBarrier) + manager->add(new pocl::AllocasToEntry()); +} + + +std::string process_cl6x_options(std::string options) +{ + std::istringstream options_stream(options); + std::string token; + std::string result; + + while (options_stream >> token) + { + if ((token.find(".obj") != std::string::npos) || + (token.find(".dll") != std::string::npos) || + (token.find(".ae66") != std::string::npos) || + (token.find(".a66") != std::string::npos) || + (token.find(".out") != std::string::npos) || + (token.find(".lib") != std::string::npos) || + (token.find(".o") != std::string::npos) || + (token.find(".o66") != std::string::npos) || + (token.find(".oe66") != std::string::npos) || + (token.find(".a") != std::string::npos) || + (token.find(".cmd") != std::string::npos)) + result += token + " "; + } + return result; +} + +/****************************************************************************** +* Find the C6000 CGT installation +******************************************************************************/ +char *get_cgt_install() +{ + char *install = getenv("TI_OCL_CGT_INSTALL"); + if (!install) + { + std::cout << + "The environment variable TI_OCL_CGT_INSTALL must be set to a " + << std::endl << + "directory path where the C6000 compiler tools are installed. 
" + << std::endl; + + abort(); + } + + return install; +} + +/****************************************************************************** +* Find the OpenCL installation +******************************************************************************/ +char *get_ocl_install() +{ + char *install = getenv("TI_OCL_INSTALL"); + if (!install) + { + std::cout << + "The environment variable TI_OCL_INSTALL must be set to a " + << std::endl << + "directory path where the TI OpenCL product is installed. " + << std::endl; + + abort(); + } + + return install; +} + +std::string get_ocl_dsp() +{ + static std::string sinstall; + + if (sinstall.empty()) + { + struct stat st; + const char *stdpath = "/usr/share/ti/opencl/dsp"; + if (stat(stdpath, &st) == 0) + sinstall = string(stdpath); + else sinstall = string(get_ocl_install()) + "/dsp"; + } + + return sinstall; +} + +/****************************************************************************** +* run_cl6x +******************************************************************************/ +static int run_cl6x(char *filename, std::string *llvm_bitcode, + bool keep_files, std::string options) +{ + std::string command("cl6x --f -q --abi=eabi --use_g3 -mv6600 -mt -mo " + "-ft=/tmp -fs=/tmp -fr=/tmp "); + + if (keep_files) command += "-mw -k --z "; + + /*------------------------------------------------------------------------- + * Turned off for now to workaround a timing bug. 
Plan to re-enable later + *------------------------------------------------------------------------*/ + command += "--disable:sploop "; + + char *cl6x_debug = getenv("TI_OCL_CL6X_DEBUG"); + + if (cl6x_debug) command += "-g -o0 "; + else command += "-o3 "; + + char *no_sp = getenv("TI_OCL_SOFTWARE_PIPELINE_OFF"); + if (no_sp) command += "-mu "; + + char *cgt_install = get_cgt_install(); + + command += "-I"; command += cgt_install; command += "/include "; + command += "-I"; command += cgt_install; command += "/lib "; + command += "-I"; command += get_ocl_dsp().c_str(); command += " "; + + command += "--bc_file="; command += filename; command += " "; + + /*------------------------------------------------------------------------- + * Encode LLVM bitcode as bytes in the .llvmir section of the .asm file + *------------------------------------------------------------------------*/ + if (llvm_bitcode != NULL) + { + char bitasm_name[32]; + strcpy(bitasm_name, filename); + strcat(bitasm_name, "_bc.asm"); + std::ofstream outasmfile(bitasm_name, std::ios::out); + outasmfile << "\t.sect \".llvmir\"\n" << "\t.retain"; + int nbytes = llvm_bitcode->size(); + for (int i = 0; i < nbytes; i++) + if (i % 10 == 0) + outasmfile << "\n\t.byte " << (int) llvm_bitcode->at(i); + else + outasmfile << ", " << (int) llvm_bitcode->at(i); + outasmfile.close(); + + command += bitasm_name; command += " "; + } + + command += "-z -ldsp.syms -o "; + command += filename; command += ".out "; + + if (keep_files) + { command += "-m "; command += filename; command += ".map "; } + + /*------------------------------------------------------------------------- + * Any libraries or object files need to go last to resolve references + *------------------------------------------------------------------------*/ + command += process_cl6x_options(options); + + //timespec t0, t1; + //clock_gettime(CLOCK_MONOTONIC, &t0); + int x = system(command.c_str()); + //clock_gettime(CLOCK_MONOTONIC, &t1); + //printf("cl6x 
time: %6.4f secs\n", + // (float)t1.tv_sec-t0.tv_sec+(t1.tv_nsec-t0.tv_nsec)/1e9); + + if (!cl6x_debug) + { + std::string strip_command("strip6x "); + strip_command += filename; strip_command += ".out"; + x = system(strip_command.c_str()); + } +} + +/** + * Extract llvm bitcode and native binary from MixedBinary + */ +bool DSPProgram::ExtractMixedBinary(std::string *binary_str, + std::string *bitcode, std::string *native) +{ + if (binary_str == NULL) return false; + if (strncmp(&binary_str->at(0), ELFMAG, SELFMAG) != 0) return false; + + /*------------------------------------------------------------------------- + * Parse ELF file format, extract ".llvmir" section into bitcode + * Valid Assumptions: 1. cl6x only creates 32-bit ELF files (for now) + * 2. cl6x ELF file has the same endianness as the host + *------------------------------------------------------------------------*/ + if (bitcode != NULL) + { + Elf32_Ehdr ehdr; /* memcpy into here to guarantee proper alignment */ + memcpy(&ehdr, & binary_str->at(0), sizeof(Elf32_Ehdr)); + int n_sects = ehdr.e_shnum; + int shoff = ehdr.e_shoff; + int shstr_sect = ehdr.e_shstrndx; + + Elf32_Shdr shdr; /* memcpy into here to guarantee proper alignment */ + int shsize = sizeof(Elf32_Shdr); + memcpy(&shdr, & binary_str->at(shoff + shstr_sect * shsize), shsize); + char *strtab = & binary_str->at(shdr.sh_offset); + + int i; + for (i = 0; i < n_sects; i++) + { + if (i == shstr_sect) continue; + memcpy(&shdr, & binary_str->at(shoff + i * shsize), shsize); + if (strcmp(&strtab[shdr.sh_name], ".llvmir") == 0) break; + } + if (i >= n_sects) return false; + + bitcode->clear(); + bitcode->append(& binary_str->at(shdr.sh_offset), shdr.sh_size); + } + + /*------------------------------------------------------------------------- + * Return the c6x ELF file in binary_str as native binary + *------------------------------------------------------------------------*/ + if (native != NULL) + { + native->clear(); + 
native->append(*binary_str); + } + + return true; +} + + +/** + * Write native binary into file, create tmporary filename in p_outfile + */ +void DSPProgram::WriteNativeOut(std::string *native) +{ + try + { + char name_out[] = "/tmp/openclXXXXXX"; + int fOutfile = mkstemp(name_out); + strcpy(p_outfile, name_out); + strcat(p_outfile, ".out"); + + std::ofstream outfile(p_outfile, std::ios::out | std::ios::binary); + outfile.write(native->data(), native->size()); + outfile.close(); + close(fOutfile); + } + catch(...) { std::cout << "ERROR: Binary write out failure" << std::endl; } +} + +/** + * Native binary is stored in file, filename in p_outfile + * Input: binary_str contains only the bitcode + * Output: binary_str contains c6x ELF file with bitcode in ".llvmir" section + */ +void DSPProgram::ReadEmbeddedBinary(std::string *binary_str) +{ + if (binary_str == NULL) return; + + int length; + char *buffer = NULL; + + try + { + std::ifstream is; + is.open(p_outfile, std::ios::binary); + is.seekg(0, std::ios::end); + length = is.tellg(); + is.seekg(0, std::ios::beg); + buffer = new char[length]; + is.read(buffer, length); + is.close(); + + binary_str->clear(); + binary_str->append(buffer, length); + delete [] buffer; + } + catch(...) { std::cout << "ERROR: Binary read in failure" << std::endl; } +} + +bool DSPProgram::build(llvm::Module *module, std::string *binary_str) +{ + p_module = module; + + /*------------------------------------------------------------------------ + * The input binary_str could be any of the following: + * 1. Mixed C6x binary embedded with LLVM bitcode, extract C6x native + * binary and return. There is no need to rebuild from LLVM module. + * 2. 
LLVM bitcode, proceed to the regular build: + * 2.1 return a corresponding cached c6x binary, if found + * 2.2 invoke c6x compiler toolchain, embed LLVM bitcode, build + * In either case, put c6x binary in binary_str when return + *------------------------------------------------------------------------*/ + std::string native; + if (ExtractMixedBinary(binary_str, NULL, &native)) + { + WriteNativeOut(&native); + return true; + } + + if (p_cache_kernels) + { + string cached_outfile = genfile_cache::instance()->lookup + (p_module, p_program->deviceDependentCompilerOptions(p_device)); + + if (!cached_outfile.empty()) + { + strcpy(p_outfile, cached_outfile.c_str()); + ReadEmbeddedBinary(binary_str); + return true; + } + } + + char name_template[] = "/tmp/openclXXXXXX"; + int pFile = mkstemp(name_template); + + strcpy(p_outfile, name_template); + strcat(p_outfile, ".out"); + + if (pFile != -1) + { + if (p_keep_files) + { + //write out the source as well + + std::string filename(name_template); + filename += ".cl"; + std::ofstream out(filename.c_str()); + out << p_program->source(); + out.close(); + } + + llvm::raw_fd_ostream ostream(pFile, false); + llvm::WriteBitcodeToFile(p_module, ostream); + ostream.flush(); + + run_cl6x(name_template, binary_str, p_keep_files, + p_program->deviceDependentCompilerOptions(p_device)); + + if (!p_keep_files) + { + unlink(name_template); + + char objfile[32]; + strcpy(objfile, name_template); + strcat(objfile, ".obj"); + unlink(objfile); + + if (binary_str != NULL) + { + strcpy(objfile, name_template); + strcat(objfile, "_bc.asm"); + unlink(objfile); + + strcpy(objfile, name_template); + strcat(objfile, "_bc.obj"); + unlink(objfile); + } + } + + if (p_cache_kernels) + genfile_cache::instance()->remember(p_outfile, p_module, + p_program->deviceDependentCompilerOptions(p_device)); + + ReadEmbeddedBinary(binary_str); + } + + if (pFile != -1) close(pFile); + + return true; +} + +DSPDevicePtr DSPProgram::query_symbol(const char *symname) +{ + 
DSPDevicePtr addr; + + bool found = DLOAD_query_symbol(p_device->dload_handle(), p_program_handle, + symname, &addr); + + return (found) ? addr : 0; +} + diff --git a/src/core/dsp/program.h b/src/core/dsp/program.h new file mode 100644 index 0000000..63c1858 --- /dev/null +++ b/src/core/dsp/program.h @@ -0,0 +1,92 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#ifndef __DSP_PROGRAM_H__ +#define __DSP_PROGRAM_H__ + +#include "device.h" +#include "../deviceinterface.h" +#include <vector> + +namespace llvm +{ + class ExecutionEngine; + class Module; +} + +namespace Coal +{ + +class DSPDevice; +class Program; + +class DSPProgram : public DeviceProgram +{ + public: + struct seg_desc + { + seg_desc(DSPDevicePtr p, int s, uint32_t f) : + ptr(p), size(s), flags(f) {} + DSPDevicePtr ptr; + unsigned size; + uint32_t flags; + }; + + typedef std::vector<seg_desc> segment_list; + + public: + DSPProgram(DSPDevice *device, Program *program); + ~DSPProgram(); + + bool linkStdLib() const; + const char* outfile_name() const; + void createOptimizationPasses(llvm::PassManager *manager, + bool optimize, bool hasBarrier=false); + bool build(llvm::Module *module, std::string *binary_str); + bool ExtractMixedBinary(std::string *binary_str, + std::string *bitcode, std::string *native); + void WriteNativeOut(std::string *native); + void ReadEmbeddedBinary(std::string *binary_str); + + DSPDevicePtr query_symbol(const char *symname); + DSPDevicePtr data_page_ptr(); + bool load(); + bool is_loaded() const; + + private: + DSPDevice *p_device; + Program *p_program; + llvm::Module *p_module; + int p_program_handle; + char p_outfile[32]; + bool p_loaded; + segment_list p_segments_written; + bool p_keep_files; + bool p_cache_kernels; +}; +} 
+#endif diff --git a/src/core/dsp/shmem.cpp b/src/core/dsp/shmem.cpp new file mode 100644 index 0000000..6aec2f8 --- /dev/null +++ b/src/core/dsp/shmem.cpp @@ -0,0 +1,539 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "shmem.h" + +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/mman.h> +#include <ti/cmem.h> + +#define REPORT(x) printf(x "\n") +#define ERR(status, msg) if (status) { printf("ERROR: %s\n", msg); exit(-1); } + +/****************************************************************************** +* shmem::shmem +******************************************************************************/ +shmem::shmem() + : p_dsp_addr(0), p_size(0), p_page_size(sysconf(_SC_PAGE_SIZE)), p_mmap_fd(-1) + , p_mpm_transport_handle(NULL) + +{ } + +/****************************************************************************** +* shmem::~shmem +******************************************************************************/ +shmem::~shmem() +{ + if (p_mmap_fd != -1) close(p_mmap_fd); +} + +/****************************************************************************** +* shmem::configure +******************************************************************************/ +void shmem::configure_base(DSPDevicePtr64 dsp_addr, uint64_t size) +{ + /*------------------------------------------------------------------------- + * If the sysconf for the page size failed + *------------------------------------------------------------------------*/ + if (p_page_size <= 0) { REPORT("Failed to get PAGE_SIZE"); return; } + + // p_mmap_fd = open("/dev/mem", (O_RDWR | O_SYNC)); + // Now we use mpm_transport_{open, mmap, munmap, close} + /*------------------------------------------------------------------------- + * core1-core7's l2 go through /dev/dsp{1-7} + * everything else (core0's l2, msmc, global addr) go through /dev/dsp0 + *------------------------------------------------------------------------*/ + char devname[16]; + strcpy(devname, "dsp0"); + if (0x11800000 <= dsp_addr & dsp_addr < 0x17900000) + devname[3] = ((dsp_addr >> 
24) - 0x10) + '0'; + mpm_transport_open_t mpm_transport_open_cfg; + mpm_transport_open_cfg.open_mode = (O_SYNC|O_RDWR); + p_mpm_transport_handle = mpm_transport_open(devname, + &mpm_transport_open_cfg); + + /*------------------------------------------------------------------------- + * If the open failed + *------------------------------------------------------------------------*/ + // if (p_mmap_fd == -1) { REPORT("Failed to open /dev/mem"); return; } + if (p_mpm_transport_handle == NULL) + { + printf("Failed to open /dev/%s", devname); + return; + } + + p_dsp_addr = dsp_addr; + p_size = size; +} + + +/****************************************************************************** +* shmem_persistent::shmem +******************************************************************************/ +#define MULTIPLE_OF_POW2(x, y) (((x) & ((y)-1)) != 0 ? false : true) + +shmem_persistent::shmem_persistent() + : p_host_addr(0), p_xlate_dsp_to_host_offset(0) +{ } + +/****************************************************************************** +* shmem_persistent::configure +******************************************************************************/ +void shmem_persistent::configure(DSPDevicePtr64 dsp_addr, uint64_t size) +{ + configure_base(dsp_addr, size); + + /*------------------------------------------------------------------------- + * if base class failed to construct, because /dev/mem could not be opened + *------------------------------------------------------------------------*/ + // if (p_mmap_fd == -1) return; + if (p_mpm_transport_handle == NULL) return; + + if (!MULTIPLE_OF_POW2(dsp_addr, p_page_size)) + { + REPORT("Mapped region addr is not a multiple of page size"); + return; + } + + if (!MULTIPLE_OF_POW2(size, p_page_size)) + { + REPORT("Mapped region size is not a multiple of page size"); + return; + } + + //p_host_addr = mmap(0, size, (PROT_READ|PROT_WRITE), MAP_SHARED, p_mmap_fd, + // (off_t)dsp_addr); + mpm_transport_mmap_t mpm_transport_mmap_cfg; + 
mpm_transport_mmap_cfg.mmap_prot = (PROT_READ|PROT_WRITE); + mpm_transport_mmap_cfg.mmap_flags = MAP_SHARED; + + p_host_addr = (void *)mpm_transport_mmap(p_mpm_transport_handle, + dsp_addr, size, + &mpm_transport_mmap_cfg); + + // if (p_host_addr == MAP_FAILED) + if (p_host_addr == (void *) -1) + { + REPORT("Failed to mmap"); + p_host_addr = 0; + return; + } + + p_xlate_dsp_to_host_offset = (void*)((int64_t)p_host_addr - dsp_addr); +} + +/****************************************************************************** +* shmem_persistent::~shmem_persistent +******************************************************************************/ +shmem_persistent::~shmem_persistent() +{ + // if (p_host_addr) munmap(p_host_addr, p_size); + if (p_host_addr) + mpm_transport_munmap(p_mpm_transport_handle, p_host_addr, p_size); +} + +/****************************************************************************** +* shmem_persistent::map +******************************************************************************/ +void *shmem_persistent::map(DSPDevicePtr64 dsp_addr, uint32_t size, bool is_read) +{ + if (!p_host_addr) return 0; + + if (dsp_addr >= p_dsp_addr && dsp_addr + size <= p_dsp_addr + p_size) + return dsp_addr + (char*)p_xlate_dsp_to_host_offset; + else + { + REPORT("Attempting to map a region outside a defined area"); + return 0; + } +} + +/****************************************************************************** +* shmem_persistent::unmap +******************************************************************************/ +void shmem_persistent::unmap(void* host_addr, uint32_t size, bool is_write) +{ + // if (host_addr) msync(host_addr, size, MS_SYNC); +} + + + +/****************************************************************************** +* shmem_ondemand::shmem_ondemap +******************************************************************************/ +shmem_ondemand::shmem_ondemand() +{ } + 
+/****************************************************************************** +* shmem::~shmem +******************************************************************************/ +shmem_ondemand::~shmem_ondemand() +{ +} + +/****************************************************************************** +* shmem_ondemand::configure +******************************************************************************/ +void shmem_ondemand::configure(DSPDevicePtr64 dsp_addr, uint64_t size) +{ + configure_base(dsp_addr, size); +} + + +/****************************************************************************** +* shmem_ondemand::map +******************************************************************************/ +void *shmem_ondemand::map(DSPDevicePtr64 dsp_addr, uint32_t size, bool is_read) +{ + if (!MULTIPLE_OF_POW2(dsp_addr, p_page_size)) + { + REPORT("Mapped region addr is not a multiple of page size"); + return 0; + } + + if (!MULTIPLE_OF_POW2(size, p_page_size)) + { + REPORT("Mapped region addr is not a multiple of page size"); + return 0; + } + + if (dsp_addr < p_dsp_addr || dsp_addr + size > p_dsp_addr + p_size) + { + REPORT("Attempting to map a region outside a defined area"); + return 0; + } + + //void *host_addr = mmap(0, size, (PROT_READ|PROT_WRITE), MAP_SHARED, + // p_mmap_fd, (off_t)dsp_addr); + mpm_transport_mmap_t mpm_transport_mmap_cfg; + mpm_transport_mmap_cfg.mmap_prot = (PROT_READ|PROT_WRITE); + mpm_transport_mmap_cfg.mmap_flags = MAP_SHARED; + + void * host_addr = mpm_transport_mmap(p_mpm_transport_handle, + dsp_addr, size, + &mpm_transport_mmap_cfg); + + // if (host_addr == MAP_FAILED) + if (host_addr == (void *) -1) + { + REPORT("Failed to mmap"); + return 0; + } + + return host_addr; +} + +/****************************************************************************** +* shmem_ondemand::unmap +******************************************************************************/ +void shmem_ondemand::unmap(void* host_addr, uint32_t size, bool is_write) +{ 
+ // if (host_addr) munmap(host_addr, size); +} + +/****************************************************************************** +* shmem_cmem_persistent::shmem +******************************************************************************/ +shmem_cmem_persistent::shmem_cmem_persistent(int cmem_block) + : p_host_addr(0), p_xlate_dsp_to_host_offset(0), p_cmem_block(cmem_block) +{ } + +/****************************************************************************** +* shmem_cmem_persistent::init +* TODO: remove addr3, size3 once uboot is updated, so that we don't have +* have fragemented CMEM blocks for DDR +******************************************************************************/ +void shmem_cmem_persistent::cmem_init(DSPDevicePtr64 *addr1, uint64_t *size1, + DSPDevicePtr *addr2, uint32_t *size2, + DSPDevicePtr64 *addr3, uint64_t *size3) +{ + /*------------------------------------------------------------------------- + * Assume this is the only use of CMEM, so we reset everything + *------------------------------------------------------------------------*/ +#if 0 + const char *cmem_command = "modprobe -r cmemk; modprobe cmemk " + "phys_start=0xa2000000 phys_end=0x100000000 pools=1x1577058304 " + "phys_start_1=0x0c000000 phys_end_1=0x0c500000 pools_1=1x5242880 " + "allowOverlap=1"; + + int result = system(cmem_command); +#endif + + const char *cmem_command = "For available CMEM DDR block size: ~1.5GB:\n" + "modprobe cmemk " + "phys_start=0x823000000 phys_end=0x880000000 pools=1x1560281088 " + "phys_start_1=0x0C040000 phys_end_1=0x0C500000 " + "allowOverlap=1"; + const char *cmem_command2 = "For available CMEM DDR block size: ~3.5GB:\n" + "modprobe cmemk " + "phys_start=0x823000000 phys_end=0x900000000 pools=1x3707764736 " + "phys_start_1=0x0C040000 phys_end_1=0x0C500000 " + "allowOverlap=1"; + const char *cmem_command3 = "For available CMEM DDR block size: ~7.5GB:\n" + "modprobe cmemk " + "phys_start=0x823000000 phys_end=0xA00000000 pools=1x8002732032 " + 
"phys_start_1=0x0C040000 phys_end_1=0x0C500000 " + "allowOverlap=1"; + + /*------------------------------------------------------------------------- + * First initialize the CMEM module + *------------------------------------------------------------------------*/ + if (CMEM_init() == -1) + { + printf("\nThe cmemk kernel module does not appear to installed.\n\n" + "Commands such as the following run as root would " + "install cmemk\n" + "and allow OpenCL to proceed properly. The actual memory " + "address values for\n" + "your system may differ.\n\n"); + printf("%s\n\n", cmem_command); + printf("%s\n\n", cmem_command2); + printf("%s\n\n", cmem_command3); + exit(-1); + } + + /*------------------------------------------------------------------------- + * Debug to see in cmem init was correct + *------------------------------------------------------------------------*/ + int num_Blocks = 0; + CMEM_getNumBlocks(&num_Blocks); + if (num_Blocks < 2) + { + printf("\nOpenCL needs at least two CMEM blocks to operate properly.\n" + "One for DDR, the other for MSMC. 
Example commands:\n"); + printf("%s\n\n", cmem_command); + printf("%s\n\n", cmem_command2); + printf("%s\n\n", cmem_command3); + exit(-1); + } + + CMEM_BlockAttrs pattrs0 = {0, 0}; + CMEM_BlockAttrs pattrs1 = {0, 0}; + CMEM_BlockAttrs pattrs2 = {0, 0}; + + CMEM_getBlockAttrs(0, &pattrs0); + CMEM_getBlockAttrs(1, &pattrs1); + if (num_Blocks > 2) + CMEM_getBlockAttrs(2, &pattrs2); + + /*------------------------------------------------------------------------- + * Return 36-bit addr, and up to 7.5G memory size + *------------------------------------------------------------------------*/ + *addr1 = (DSPDevicePtr64) pattrs0.phys_base; + *size1 = (uint64_t) pattrs0.size; + // Persistent CMEM should start within 0x8:2200_0000 - 0x8:4000_0000 + if (*addr1 >= MPAX_USER_MAPPED_DSP_ADDR) + { + printf("Unable to allocate OCL persistent CMem from 0x%llx\n", + pattrs0.phys_base); + exit(EXIT_FAILURE); + } + + *addr2 = pattrs1.phys_base; + *size2 = pattrs1.size; + if (*addr2 < MSMC_OCL_START_ADDR || *addr2 >= MSMC_OCL_END_ADDR) + { + printf("Unable to allocate OCL MSMC memory from 0x%llx\n", + pattrs1.phys_base); + exit(EXIT_FAILURE); + } + + /*------------------------------------------------------------------------- + * Grab all available CMEM physical address, to be managed by OCL + *------------------------------------------------------------------------*/ + DSPDevicePtr64 alloc_dsp_addr = 0; + CMEM_AllocParams params = CMEM_DEFAULTPARAMS; + params.flags = CMEM_CACHED; + params.type = CMEM_POOL; + alloc_dsp_addr = CMEM_allocPoolPhys2(0, 0, ¶ms); + if (!alloc_dsp_addr || alloc_dsp_addr != *addr1) + { + printf("Failed to allocate 0x%llx from CMem 0, allocated=0x%llx\n", + *size1, alloc_dsp_addr); + exit(EXIT_FAILURE); + } + + params.type = CMEM_HEAP; + alloc_dsp_addr = CMEM_allocPhys2(1, *size2, ¶ms); + if (!alloc_dsp_addr || alloc_dsp_addr != *addr2) + { + printf("Failed to allocate 0x%x from CMem 1, allocated=0x%llx\n", + *size2, alloc_dsp_addr); + exit(EXIT_FAILURE); + } + + 
if (num_Blocks > 2) + { + *addr3 = pattrs2.phys_base; + *size3 = pattrs2.size; + params.type = CMEM_POOL; + alloc_dsp_addr = CMEM_allocPoolPhys2(2, 0, ¶ms); + if (!alloc_dsp_addr || alloc_dsp_addr != *addr3) + { + printf("Failed to allocate 0x%llx from CMem 2, allocated=0x%llx\n", + *size3, alloc_dsp_addr); + exit(EXIT_FAILURE); + } + } + else + { + *addr3 = 0; + *size3 = 0; + } +} + +/****************************************************************************** +* shmem_cmem_persistent::cmem_exit +******************************************************************************/ +void shmem_cmem_persistent::cmem_exit() +{ + /* Finalize the CMEM module */ + if (CMEM_exit() == -1) ERR(1, "Failed to finalize CMEM"); +} + +/****************************************************************************** +* shmem_cmem_persistent::configure +******************************************************************************/ +void shmem_cmem_persistent::configure(DSPDevicePtr64 dsp_addr, uint64_t size) +{ + p_dsp_addr = dsp_addr; + p_size = size; + DSPDevicePtr64 cmem_addr = p_dsp_addr; + if (p_dsp_addr >= 0xA0000000 && p_dsp_addr < 0xFFFFFFFF) + cmem_addr = p_dsp_addr - 0xA0000000 + 0x820000000ULL; + p_host_addr = CMEM_map(cmem_addr, size); + if (! 
p_host_addr) + ERR(1, "Cannot map CMEM physical memory into the Host virtual address space.\n" + " This is typically due to Linux system memory being near capacity."); + p_xlate_dsp_to_host_offset = (int64_t)p_host_addr - dsp_addr; +} + +/****************************************************************************** +* shmem_cmem_persistent::~shmem_cmem_persistent +******************************************************************************/ +shmem_cmem_persistent::~shmem_cmem_persistent() +{ + if (p_dsp_addr == 0) return; + + if (p_host_addr != NULL) CMEM_unmap(p_host_addr, p_size); + CMEM_AllocParams params = CMEM_DEFAULTPARAMS; + params.flags = CMEM_CACHED; + DSPDevicePtr64 cmem_addr = p_dsp_addr; + if (p_dsp_addr > 0xA0000000 && p_dsp_addr < 0xFFFFFFFF) + cmem_addr = p_dsp_addr - 0xA0000000 + 0x820000000ULL; + CMEM_freePhys(cmem_addr, ¶ms); +} + +/****************************************************************************** +* shmem_cmem_persistent::map: dsp_addr (phys) -> host_addr (virt) +******************************************************************************/ +void *shmem_cmem_persistent::map(DSPDevicePtr64 dsp_addr, uint32_t size, bool is_read) +{ + if (!p_host_addr || + dsp_addr < p_dsp_addr || dsp_addr + size > p_dsp_addr + p_size) + { + ERR(1, "Attempting to cmem_map a region outside a defined area"); + return NULL; + } + + void *host_addr = dsp_addr + (char*)p_xlate_dsp_to_host_offset; + if (is_read) CMEM_cacheInv(host_addr, size); + return host_addr; +} + +/****************************************************************************** +* shmem_cmem_persistent::unmap: flush host side writes +******************************************************************************/ +void shmem_cmem_persistent::unmap(void* host_addr, uint32_t size, bool is_write) +{ + if (host_addr && is_write) CMEM_cacheWb(host_addr, size); +} + + +/****************************************************************************** +* shmem_cmem_ondeman::configure 
+******************************************************************************/ +void shmem_cmem_ondemand::configure(DSPDevicePtr64 dsp_addr, uint64_t size) +{ + p_dsp_addr = dsp_addr; + p_size = size; +} + +/****************************************************************************** +* shmem_cmem_ondemand::map: dsp_addr (phys) -> host_addr (virt) +******************************************************************************/ +void *shmem_cmem_ondemand::map(DSPDevicePtr64 dsp_addr, uint32_t size, bool is_read) +{ + void *host_addr = CMEM_map(dsp_addr, size); + if (! host_addr) ERR(1, "Failed to map CMEM address (ondemand)"); + if (is_read) CMEM_cacheInv(host_addr, size); + return host_addr; +} + +/****************************************************************************** +* shmem_cmem_persistent::unmap: flush host side writes +******************************************************************************/ +void shmem_cmem_ondemand::unmap(void* host_addr, uint32_t size, bool is_write) +{ + if (host_addr && is_write) CMEM_cacheWb(host_addr, size); + if (host_addr) CMEM_unmap(host_addr, size); +} + +/****************************************************************************** +* shmem_cmem_ondemand::malloc: allocate CMEM physical address +* 64-bit size: could be allocating a buffer, then accessing smaller subbuffers +******************************************************************************/ +DSPDevicePtr64 shmem_cmem_ondemand::cmem_malloc(uint64_t size) +{ + CMEM_AllocParams params = CMEM_DEFAULTPARAMS; + params.flags = CMEM_CACHED; + params.type = CMEM_HEAP; + DSPDevicePtr64 addr = CMEM_allocPhys2(0, size, ¶ms); + if (!addr) + { + printf("Failed to allocate space 0x%llx from CMem\n", size); + exit(EXIT_FAILURE); + } + return addr; +} + +/****************************************************************************** +* shmem_cmem_ondemand::free: free allocated CMEM physical address 
+******************************************************************************/ +void shmem_cmem_ondemand::cmem_free(DSPDevicePtr64 addr) +{ + CMEM_AllocParams params = CMEM_DEFAULTPARAMS; + params.flags = CMEM_CACHED; + params.type = CMEM_HEAP; + CMEM_freePhys(addr, ¶ms); +} + diff --git a/src/core/dsp/shmem.h b/src/core/dsp/shmem.h new file mode 100644 index 0000000..03504a0 --- /dev/null +++ b/src/core/dsp/shmem.h @@ -0,0 +1,134 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/
#include <stdint.h>
#ifndef _SHMEM_H
#define _SHMEM_H

extern "C"
{
    #include <mpm_transport.h>
}
#include "dspmem.h"

/*=============================================================================
* Abstract class for Shared memory
*
* A region is described by a DSP address and a size (configure); map()/unmap()
* translate a DSP address range inside it to a host pointer and back.
*============================================================================*/
class shmem
{
  public:
    shmem ();
    virtual ~shmem ();

    // Opens the mpm transport device for dsp_addr and records the region.
    virtual void  configure_base(DSPDevicePtr64 dsp_addr,  uint64_t size);
    virtual void  configure     (DSPDevicePtr64 dsp_addr,  uint64_t size) = 0;
    virtual void *map           (DSPDevicePtr64 dsp_addr,  uint32_t size,
                                 bool is_read=false) = 0;
    virtual void  unmap         (void* host_addr, uint32_t size,
                                 bool is_write=false) = 0;

    // NOTE(review): declared here, but no definition appears in shmem.cpp
    // as shown in this change -- confirm where (or whether) it is defined.
    uint32_t       page_size ();
    DSPDevicePtr64 start     () { return p_dsp_addr; }
    int64_t        size      () { return p_size; }

  protected:
    DSPDevicePtr64  p_dsp_addr;              // start of the configured region
    int64_t         p_size;                  // size of the configured region
    uint32_t        p_page_size;             // from sysconf(_SC_PAGE_SIZE)
    int32_t         p_mmap_fd;               // -1 unless a descriptor is held
    mpm_transport_h p_mpm_transport_handle;  // NULL until configure_base() succeeds

};

/*=============================================================================
* Persistent implementation of shmem
*
* The whole region is mapped once in configure(); map() only translates.
*============================================================================*/
class shmem_persistent : public shmem
{
  public:
    shmem_persistent ();
    ~shmem_persistent ();
    void configure(DSPDevicePtr64 dsp_addr,  uint64_t size);
    virtual void *map      (DSPDevicePtr64 dsp_addr, uint32_t size, bool is_read=false);
    virtual void  unmap    (void* host_addr, uint32_t size, bool is_write=false);

  private:
    void * p_host_addr;                 // host mapping of the region, 0 if none
    void * p_xlate_dsp_to_host_offset;  // added to a DSP address to get the host address
};

/*=============================================================================
* On Demand implementation of shmem
*
* Every map() call creates its own mapping through mpm_transport_mmap.
*============================================================================*/
class shmem_ondemand : public shmem
{
  public:
    shmem_ondemand ();
    ~shmem_ondemand ();
    void configure(DSPDevicePtr64 dsp_addr,  uint64_t size);
    virtual void *map      (DSPDevicePtr64 dsp_addr, uint32_t size, bool is_read=false);
    virtual void  unmap    (void* host_addr, uint32_t size, bool is_write=false);
};

/*=============================================================================
* Persistent implementation of shmem using CMem
*
* cmem_init()/cmem_exit() are process-wide; configure() maps one region with
* CMEM_map and map()/unmap() perform cache invalidate / write-back.
*============================================================================*/
class shmem_cmem_persistent : public shmem
{
  public:
    shmem_cmem_persistent (int cmem_block);
    ~shmem_cmem_persistent ();
    void configure(DSPDevicePtr64 dsp_addr,  uint64_t size);
    virtual void *map      (DSPDevicePtr64 dsp_addr, uint32_t size, bool is_read=false);
    virtual void  unmap    (void* host_addr, uint32_t size, bool is_write=false);

    // Out parameters receive base/size of CMEM blocks 0, 1 and (optionally) 2.
    static void cmem_init(DSPDevicePtr64* addr1, uint64_t* size1,
                          DSPDevicePtr*   addr2, uint32_t* size2,
                          DSPDevicePtr64* addr3, uint64_t* size3);
    static void cmem_exit();

  private:
    void *  p_host_addr;                 // host mapping of the region, 0 if none
    int64_t p_xlate_dsp_to_host_offset;  // host address minus DSP address
    int     p_cmem_block;                // block index given to the constructor
};

/*=============================================================================
* Ondemand implementation of shmem using CMem
*
* map() calls CMEM_map each time and unmap() releases that mapping.
*============================================================================*/
class shmem_cmem_ondemand : public shmem
{
  public:
    shmem_cmem_ondemand () {}
    ~shmem_cmem_ondemand () {}
    void configure(DSPDevicePtr64 dsp_addr,  uint64_t size);
    virtual void *map      (DSPDevicePtr64 dsp_addr, uint32_t size, bool is_read=false);
    virtual void  unmap    (void* host_addr, uint32_t size, bool is_write=false);

    // Allocate / free CMEM physical memory.
    static DSPDevicePtr64 cmem_malloc(uint64_t size);
    static void           cmem_free  (DSPDevicePtr64 addr);
};

#endif // _SHMEM_H
diff --git a/src/core/dsp/source_cache.h b/src/core/dsp/source_cache.h new file mode 100644 index 0000000..66b4400 --- /dev/null +++ b/src/core/dsp/source_cache.h @@ -0,0 +1,114 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#ifndef _source_cache_ +#define _source_cache_ + +#include <llvm/Support/raw_ostream.h> +#include <llvm/Bitcode/ReaderWriter.h> + +#include <boost/lexical_cast.hpp> +#include <boost/crc.hpp> + +#include <sys/stat.h> + +#include <string> +#include <iostream> +#include <sstream> +#include <vector> +#include <stdint.h> +#include "u_locks_pthread.h" +#include "database.h" + +class source_cache +{ + public: + void remember(std::string source) + { + uint32_t hash = get_crc(source); + std::string query("insert into programs(hash, source) values(" + + boost::lexical_cast<std::string>(hash) + + ", \"" + + source + + "\");"); + + p_database.query(query.c_str()); + } + + /*------------------------------------------------------------------------- + * Thread safe instance function for singleton behavior + *------------------------------------------------------------------------*/ + static source_cache* instance () + { + static Mutex Cache_instance_mutex; + source_cache* tmp = pInstance; + + __sync_synchronize(); + + if (tmp == 0) + { + ScopedLock lck(Cache_instance_mutex); + + tmp = pInstance; + if (tmp == 0) + { + char *user = getenv("USER"); + tmp = new source_cache("/tmp/opencl_source_" + string(user)); + __sync_synchronize(); + pInstance = tmp; + } + } + return tmp; + } + + + private: + static source_cache* pInstance; + std::string p_dbname; + Database 
p_database; + + private: + source_cache(std::string db_name) : p_dbname(db_name), p_database(db_name.c_str()) + { + p_database.query("create table if not exists " + "programs(hash integer, source string);"); + } + + uint32_t get_crc(std::string& my_string) + { + boost::crc_32_type result; + result.process_bytes(my_string.data(), my_string.length()); + return result.checksum(); + } + + source_cache(const source_cache&); // copy ctor disallowed + source_cache& operator=(const source_cache&); // assignment disallowed +}; + +#endif // _source_cache_ + + diff --git a/src/core/dsp/u_concurrent_map.h b/src/core/dsp/u_concurrent_map.h new file mode 100644 index 0000000..014c0b6 --- /dev/null +++ b/src/core/dsp/u_concurrent_map.h @@ -0,0 +1,137 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +/**************************************************************************//** +* +* @file u_concurrent_map.h +* @brief TI implementation class that implements a thread safe map. +* +******************************************************************************/ +#ifndef _U_CONCURRENT_MAP_H_ +#define _U_CONCURRENT_MAP_H_ + +#include <iostream> +#include <map> +#include "u_lockable.h" + +/**************************************************************************//** +* @class concurrent_map +* +* @brief A thread safe map implementation +* +* @details This implementation wraps a standard stl map with some locking +* capability to make the member functions mutually exclusive +* regions. In derives from the class Lockable which defines a type +* Lock that can be used to define a type in a scope. The result will +* be that the remainder of the scope (or until unlock is called) is a +* mutex. +* +******************************************************************************/ +template<typename I, typename T> +class concurrent_map : public Lockable +{ +public: + concurrent_map() : M(), num_elements(0) {} + ~concurrent_map() {} + + /**********************************************************************//** + * @brief Place an object in the map. 
+ * @param data is the item to psh on the map + ***************************************************************************/ + void push(I index, T const data) + { + Lock lock(this); + M[index] = data; + num_elements++; + } + + /**********************************************************************//** + * @brief How many elements are in the map. + * @returns The number of elements in the map. + ***************************************************************************/ + int size() const + { + Lock lock(this); + return num_elements; + } + + /**********************************************************************//** + * @brief Determine if the map is empty. + * @returns true if the map is empty, otherwise false. + ***************************************************************************/ + bool empty() const + { + Lock lock(this); + return (num_elements == 0); + } + + /**********************************************************************//** + * @brief Attempt to pop an item off the map. + * @param popped_value is an output parameter that contains the object popped + * if the map is successfully popped. 
+ * @returns true if a value is popped, otherwise false + ***************************************************************************/ + bool try_pop(I idx, T& popped_value) + { + Lock lock(this); + if (num_elements == 0) return false; + + typename std::map<I,T>::iterator it = M.find(idx); + + if (it != M.end()) + { + popped_value = it->second; + M.erase (it); + num_elements--; + return true; + } + + return false; + } + + void dump() + { + for (typename std::map<I,T>::const_iterator i = M.begin(); i != M.end(); ++i) + std::cout << i->first << " ==> " << i->second << std::endl; + } + + /*------------------------------------------------------------------------- + * The class's data + *------------------------------------------------------------------------*/ +private: + std::map<I,T> M; //!< standard stl map + int num_elements; + + /*------------------------------------------------------------------------- + * Prevent copy construction and assignment + *------------------------------------------------------------------------*/ +private: + concurrent_map(const concurrent_map&); + concurrent_map& operator=(const concurrent_map&); +}; + +#endif //_U_CONCURRENT_MAP_H_ diff --git a/src/core/dsp/u_concurrent_stack.h b/src/core/dsp/u_concurrent_stack.h new file mode 100644 index 0000000..6e9755b --- /dev/null +++ b/src/core/dsp/u_concurrent_stack.h @@ -0,0 +1,124 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +/**************************************************************************//** +* +* @file u_concurrent_stack.h +* @brief TI implementation class that implements a thread safe stack. 
+* +******************************************************************************/ +#ifndef _U_CONCURRENT_STACK_H_ +#define _U_CONCURRENT_STACK_H_ + +#include <iostream> +#include <stack> +#include "u_lockable.h" + +/**************************************************************************//** +* @class concurrent_stack +* +* @brief A thread safe stack implementation +* +* @details This implementation wraps a standard stl stack with some locking +* capability to make the member functions mutually exclusive +* regions. In derives from the class Lockable which defines a type +* Lock that can be used to define a type in a scope. The result will +* be that the remainder of the scope (or until unlock is called) is a +* mutex. +* +******************************************************************************/ +template<typename T> +class concurrent_stack : public Lockable +{ +public: + concurrent_stack() : S(), num_elements(0) {} + ~concurrent_stack() {} + + /**********************************************************************//** + * @brief Place an object in the stack. + * @param data is the item to psh on the stack + ***************************************************************************/ + void push(T const data) + { + Lock lock(this); + S.push(data); + num_elements++; + } + + /**********************************************************************//** + * @brief How many elements are in the stack. + * @returns The number of elements in the stack. + ***************************************************************************/ + int size() const + { + Lock lock(this); + return num_elements; + } + + /**********************************************************************//** + * @brief Determine if the stack is empty. + * @returns true if the stack is empty, otherwise false. 
+ ***************************************************************************/ + bool empty() const + { + Lock lock(this); + return (num_elements == 0); + } + + /**********************************************************************//** + * @brief Attempt to pop an item off the stack. + * @param popped_value is an output parameter that contains the object popped + * if the stack is successfully popped. + * @returns true if a value is popped, otherwise false + ***************************************************************************/ + bool pop(T& popped_value) + { + Lock lock(this); + if (num_elements == 0) return false; + + popped_value = S.top(); + S.pop(); + num_elements--; + return true; + } + + /*------------------------------------------------------------------------- + * The class's data + *------------------------------------------------------------------------*/ +private: + std::stack<T> S; //!< standard stl stack + int num_elements; + + /*------------------------------------------------------------------------- + * Prevent copy construction and assignment + *------------------------------------------------------------------------*/ +private: + concurrent_stack(const concurrent_stack&); + concurrent_stack& operator=(const concurrent_stack&); +}; + +#endif //_U_CONCURRENT_STACK_H_ diff --git a/src/core/dsp/u_lockable.h b/src/core/dsp/u_lockable.h new file mode 100644 index 0000000..803197f --- /dev/null +++ b/src/core/dsp/u_lockable.h @@ -0,0 +1,109 @@ +/****************************************************************************** +* The Loki Library +* Copyright (c) 2001 by Andrei Alexandrescu +* Copyright (c) 2010-2014, Texas Instruments Incorporated +* +* This code accompanies the book: +* Alexandrescu, Andrei. "Modern C++ Design: Generic Programming and Design +* Patterns Applied". Copyright (c) 2001. Addison-Wesley. 
+* Permission to use, copy, modify, distribute and sell this software for any +* purpose is hereby granted without fee, provided that the above copyright +* notice appear in all copies and that both that copyright notice and this +* permission notice appear in supporting documentation. +* The author or Addison-Wesley Longman make no representations about the +* suitability of this software for any purpose. It is provided "as is" +* without express or implied warranty. +******************************************************************************/ + +/**************************************************************************//** +* +* @file u_lockable.h +* +* @brief Defines a base class that provides a derived class with a Lock type. +* +* @version 1.00.00 +* +* @note The Locakable class is a modified version of the ObjectLevelLockable +* class from the LOKI library. The copyright from that library is +* included at the top of this file. +* +******************************************************************************/ +#ifndef _U_LOCKABLE_H_ +#define _U_LOCKABLE_H_ +#include "u_locks_pthread.h" + +/**************************************************************************//** +* @brief used as a base class to give your derived class a Lock type. +* @details Have a class derive from this class and you can lock member +* functions of your class by defining a lock like this +* Lock lock(this); +******************************************************************************/ +class Lockable +{ + public: + Lockable() : mutex() {} //!< Default Constructor + Lockable(const Lockable&) : mutex() {} //!< Copy Constructor + ~Lockable() {} //!< Destructor + + /**********************************************************************//** + * @brief The Lock type defined by inheriting from Lockable. 
+ **************************************************************************/ + class Lock + { + public: + + /*******************************************************************//** + * @brief Constructing a Lock object will lock the parent object's mutex + ***********************************************************************/ + explicit Lock(const Lockable* host_) : host(*host_) + { host.mutex.Lock(); } + + /*******************************************************************//** + * @brief Destructing a Lock object will unlock the parent object's mutex + ***********************************************************************/ + ~Lock() { host.mutex.Unlock(); } + + /*******************************************************************//** + * @brief Unlock the parent object's mutex + ***********************************************************************/ + void unlock() { host.mutex.Unlock(); } + + /*******************************************************************//** + * @brief Return a raw pointer to the parent object's mutex + ***********************************************************************/ + Mutex* raw() { return &host.mutex; } + + private: + const Lockable& host; //!< a pointer back to the parent object + + private: // prevent copy construction and assignment + Lock(const Lock&); + Lock& operator=(const Lock&); + }; + + protected: + mutable Mutex mutex; +}; + +/*----------------------------------------------------------------------------- +* Can use to turn off locking without chaning client code using Lockable +*----------------------------------------------------------------------------*/ +class Lockable_off +{ + public: + Lockable_off() {} + + class Lock + { + public: + + explicit Lock(const Lockable_off* host_) { } + void unlock() { } + + private: // prevent copy construction and assignment + Lock(const Lock&); + Lock& operator=(const Lock&); + }; +}; + +#endif diff --git a/src/core/dsp/u_locks_pthread.h b/src/core/dsp/u_locks_pthread.h new file 
mode 100644 index 0000000..4663a57 --- /dev/null +++ b/src/core/dsp/u_locks_pthread.h @@ -0,0 +1,137 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
/**************************************************************************//**
*
* @file     u_locks_pthread.h
*
* @brief    TI implementation classes for mutual exclusion and locking.
*
* @ingroup  Utilities
*
* @version  1.00.00
*
******************************************************************************/
#ifndef _U_LOCKS_PTHREAD_H_
#define _U_LOCKS_PTHREAD_H_

#include <pthread.h>

/**************************************************************************//**
* @brief   Thin wrapper around one pthread mutex.
*
* @details The pthread mutex is initialised with default attributes in the
*          constructor and destroyed in the destructor.  Lock() and Unlock()
*          forward to pthread_mutex_lock() / pthread_mutex_unlock(); their
*          return codes are not inspected.
*
******************************************************************************/
class Mutex
{
  public:
    //! Construct a mutex
    Mutex()
    {
        pthread_mutex_init(&p_handle, 0);
    }

    //! Destruct a mutex
    ~Mutex()
    {
        pthread_mutex_destroy(&p_handle);
    }

    //! Lock a mutex
    void Lock()
    {
        pthread_mutex_lock(&p_handle);
    }

    //! Unlock a mutex
    void Unlock()
    {
        pthread_mutex_unlock(&p_handle);
    }

    //! Return raw ptr to the underlying pthread mutex
    pthread_mutex_t* raw()
    {
        return &p_handle;
    }

  private:
    pthread_mutex_t p_handle;   //!< The underlying pthread mutex

  private: // prevent copy construction and assignment
    Mutex(const Mutex &);
    Mutex & operator = (const Mutex &);
};

/**************************************************************************//**
* @brief   Thin wrapper around one pthread condition variable.
*
* @details The condition variable is initialised with default attributes in
*          the constructor and destroyed in the destructor.  wait() hands the
*          caller's Mutex to pthread_cond_wait(), which releases it while the
*          thread sleeps and re-acquires it before returning.
*
******************************************************************************/
class CondVar
{
  public:

    //! Constructor
    CondVar()
    {
        pthread_cond_init(&p_handle, 0);
    }

    //! Destructor
    ~CondVar()
    {
        pthread_cond_destroy(&p_handle);
    }

    /**********************************************************************//**
    * @brief Signal 1 of N threads waiting on the condition variable
    **************************************************************************/
    void notify_one()
    {
        pthread_cond_signal(&p_handle);
    }

    /**********************************************************************//**
    * @brief Signal all N threads waiting on the condition variable
    **************************************************************************/
    void notify_all()
    {
        pthread_cond_broadcast(&p_handle);
    }

    /**********************************************************************//**
    * @brief Wait on the condition variable and release the passed mutex.
    **************************************************************************/
    void wait(Mutex* m)
    {
        pthread_mutex_t* held = m->raw();
        pthread_cond_wait(&p_handle, held);
    }

  private:
    pthread_cond_t p_handle;   //!< The underlying pthread condition variable

  private: // prevent copy construction and assignment
    CondVar(CondVar&);
    CondVar& operator=(CondVar&);
};

/**************************************************************************//**
* @brief   Holds a Mutex locked for the lifetime of the object.
*
* @details The constructor locks the referenced mutex and the destructor
*          unlocks it, so the mutex is released on every path out of the
*          enclosing scope, including early returns and exceptions.
*
******************************************************************************/
class ScopedLock
{
  public:
    //! Constructor: locks m
    ScopedLock(Mutex &m) : p_guarded(m)
    {
        p_guarded.Lock();
    }

    //! Destructor: unlocks the mutex given to the constructor
    ~ScopedLock()
    {
        p_guarded.Unlock();
    }

  private:
    Mutex& p_guarded;   //!< The underlying mutex reference

  private: // prevent copy construction and assignment
    ScopedLock(const ScopedLock&);
    ScopedLock& operator=(const ScopedLock&);
};

#endif
#ifndef __UTILS_H
#define __UTILS_H

/**
 * \brief Increment a n-component vector given a maximum value
 *
 * This function is used to increment a vector for which a set of maximum
 * values each of its elements can reach before the next is incremented.
 * Element 0 varies fastest: it is incremented first, and only when it passes
 * its maximum is it reset to 0 and the carry moved on to element 1, and so on.
 *
 * For example, if \p dims is \c 3, \p vec starts at <tt>{0, 0, 0}</tt> and
 * \p maxs is <tt>{2, 3, 1}</tt>, repeatedly calling this function with the
 * same vector will produce the following results :
 *
 * \code
 * {1, 0, 0}
 * {2, 0, 0}
 * {0, 1, 0}
 * {1, 1, 0}
 * {2, 1, 0}
 * {0, 2, 0}
 * ...
 * \endcode
 *
 * Until \p vec reaches <tt>{2, 3, 1}</tt>.  The call after that resets every
 * element to 0 and returns true.
 *
 * \param dims number of elements in the vectors
 * \param vec vector whose elements will be incremented
 * \param maxs vector containing a maximum value above which each corresponding
 *        element of \p vec cannot go.
 * \return false if the increment was ok, true if \p vec was already at its
 *         maximum value and couldn't be further incremented (in which case
 *         it has wrapped around to all zeros).  With \p dims equal to 0
 *         nothing is touched and false is returned.
 */
template<typename T>
bool incVec(unsigned long dims, T *vec, T *maxs)
{
    bool overflow = false;

    // The index has the same type as dims so the comparison can never be
    // defeated by a narrower counter wrapping around.
    for (unsigned long i = 0; i < dims; ++i)
    {
        vec[i] += 1;

        if (vec[i] > maxs[i])
        {
            // This component wrapped: reset it and carry into the next one.
            vec[i] = 0;
            overflow = true;
        }
        else
        {
            overflow = false;
            break;
        }
    }

    return overflow;
}
#endif
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "wga.h" +#include <iostream> +#include <llvm/Pass.h> +#include <llvm/IR/Function.h> +#include <llvm/IR/Module.h> +#include <llvm/IR/BasicBlock.h> +#include <llvm/IR/DataLayout.h> +#include <llvm/Support/raw_ostream.h> +#include <llvm/Support/InstIterator.h> +#include <llvm/IR/IntrinsicInst.h> +#include "llvm/Support/CFG.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/GraphTraits.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" +#include "boost/assign/std/set.hpp" +#include <stdio.h> + +using namespace std; +using namespace boost::assign; + +namespace llvm +{ + +/****************************************************************************** +* createTIOpenclWorkGroupAggregation +******************************************************************************/ +Pass *createTIOpenclWorkGroupAggregationPass(bool is_pocl_mode) +{ + TIOpenclWorkGroupAggregation *fp = new TIOpenclWorkGroupAggregation( + is_pocl_mode); + return fp; +} + +/************************************************************************** +* Constructor +**************************************************************************/ +TIOpenclWorkGroupAggregation::TIOpenclWorkGroupAggregation(bool pocl_mode) : + FunctionPass(ID), is_pocl_mode(pocl_mode) +{ + for (int i = 0; i < MAX_DIMENSIONS; ++i) 
IVPhi[i] = 0; +} + +/************************************************************************** +* Get index variable +* 1. Original mode, only one loop inserted: return IVPhi[] +* 2. pocl mode, multiple loops inserted: return a new LoadInst +* When the dimension argument is not a compile-time constant, a select chain +* is emitted instead: (arg == 2) ? ivz : (arg == 1 ? ivy : ivx) +**************************************************************************/ +llvm::Instruction* TIOpenclWorkGroupAggregation::get_IV(Function &F, + CallInst *call) +{ + llvm::Value *ivx, *ivy, *ivz; + Value *arg = call->getArgOperand(0); + // 9999 is the sentinel for "dimension argument is not a constant" + // NOTE(review): a constant dim outside 0..2 leaves iv unset (pocl mode) or + // indexes past IVPhi[] (original mode) - confirm callers never pass one + uint32_t dim = 9999; + + if (ConstantInt * constInt = dyn_cast<ConstantInt>(arg)) + dim = constInt->getSExtValue(); + + if (is_pocl_mode) + { + llvm::GlobalValue *iv; + if (dim == 2) + iv = F.getParent()->getNamedGlobal("_local_id_z"); + else if (dim == 1) + iv = F.getParent()->getNamedGlobal("_local_id_y"); + else if (dim == 0) + iv = F.getParent()->getNamedGlobal("_local_id_x"); + if (dim != 9999) return new LoadInst(iv); + + ivx = F.getParent()->getNamedGlobal("_local_id_x"); + ivy = F.getParent()->getNamedGlobal("_local_id_y"); + ivz = F.getParent()->getNamedGlobal("_local_id_z"); + } + else + { + if (dim != 9999) return IVPhi[dim]; + + ivx = IVPhi[0]; + ivy = IVPhi[1]; + ivz = IVPhi[2]; + } + + // not constant arg: return (arg == 2) ? ivz : (arg == 1 ? ivy : ivx) + // The inner select (y vs x) tests against 1 and the outer select (z vs the + // rest) tests against 2; comparing them the other way round swaps y and z. + Type *Int32 = Type::getInt32Ty(F.getContext()); + Value *one = ConstantInt::get(Int32, 1); + Value *two = ConstantInt::get(Int32, 2); + llvm::Value *cyx = new ICmpInst(call, ICmpInst::ICMP_EQ, arg, one); + llvm::Value *syx = SelectInst::Create(cyx, ivy, ivx, "", call); + llvm::Value *czyx = new ICmpInst(call, ICmpInst::ICMP_EQ, arg, two); + return SelectInst::Create(czyx, ivz, syx, "", is_pocl_mode ? 
NULL : call); +} + +/************************************************************************** +* runOnFunction(Function &F) +* Returns true when any step (loop insertion, alloca rewriting or getXXX +* rewriting) changed the function, not only the final rewrite. +**************************************************************************/ +bool TIOpenclWorkGroupAggregation::runOnFunction(Function &F) +{ + /*------------------------------------------------------------------------- + * Determine how many dimensions are referenced using OpenCL getXXX + * functions, and record them all for later rewrite. + *------------------------------------------------------------------------*/ + int dims = 0; + if (!is_pocl_mode) dims = findNeededLoopNest(F); + + /*------------------------------------------------------------------------- + * Add a loop nest for each dimension referenced that requires a workitem + * id. Every add_loop() call splits blocks, so dims > 0 means "modified". + *------------------------------------------------------------------------*/ + if (!is_pocl_mode) for (int i = 0; i < dims; ++i) add_loop(F, i); + bool changed = (dims > 0); + + /*------------------------------------------------------------------------- + * rewrite the alloca() generated during pocl llvm work-group aggregation + *------------------------------------------------------------------------*/ + if (is_pocl_mode) changed |= rewrite_allocas(F); + + /*------------------------------------------------------------------------- + * rewrite the OpenCL getXXX dimension query functions to reference the info + * packet for the workgroup. Return true if we modified the function. 
+ *------------------------------------------------------------------------*/ + changed |= rewrite_ocl_funcs(F); + return changed; +} + +/****************************************************************************** +* getAnalysisUsage(AnalysisUsage &Info) const +******************************************************************************/ +void TIOpenclWorkGroupAggregation::getAnalysisUsage(AnalysisUsage &Info) const +{ + /*------------------------------------------------------------------------- + * This will ensure that all returns go through a single exit node, which + * our WGA loop generation algorithm depends on. + *------------------------------------------------------------------------*/ + Info.addRequired<UnifyFunctionExitNodes>(); +} + +/************************************************************************** +* findNeededLoopNest(Function &F) +**************************************************************************/ +unsigned int TIOpenclWorkGroupAggregation::findNeededLoopNest(Function &F) +{ + unsigned int maxDim = 0; + + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) + if (CallInst * callInst = dyn_cast<CallInst>(&*I)) + { + if (!callInst->getCalledFunction()) continue; + string functionName(callInst->getCalledFunction()->getName()); + + if (functionName == "get_local_id" || + functionName == "get_global_id") + { + Value *arg = callInst->getArgOperand(0); + if (ConstantInt * constInt = dyn_cast<ConstantInt>(arg)) + { + unsigned int dimIdx = constInt->getSExtValue(); + dimIdx = min(MAX_DIMENSIONS-1, dimIdx); + maxDim = max(maxDim, dimIdx + 1); + } + + /*------------------------------------------------------------- + * if the work group function has a variable argument, then + * assume worst case and return 3 loop levels are needed. 
+ *------------------------------------------------------------*/ + else return 3; + } + } + + return maxDim; +} + +/************************************************************************** +* createLoadGlobal +* Create an aligned 32 bit load from a global address. +**************************************************************************/ +Instruction* TIOpenclWorkGroupAggregation::createLoadGlobal + (int32_t idx, Module* M, Instruction *before, const char *name) +{ + llvm::ArrayType *type = ArrayType::get( + IntegerType::getInt32Ty(getGlobalContext()), 64); + llvm::Value* dummy = M->getOrInsertGlobal("kernel_config_l2", type); + + GlobalVariable* global = M->getNamedGlobal("kernel_config_l2"); + + std::vector<Value*> indices; + indices.push_back(ConstantInt::get(IntegerType::getInt32Ty(getGlobalContext()), 0)); + indices.push_back(ConstantInt::get(IntegerType::getInt32Ty(getGlobalContext()), idx)); + + Constant* gep = ConstantExpr::getInBoundsGetElementPtr (global, indices); + LoadInst* ld = new LoadInst(gep, name, before); + + ld->setAlignment(4); + return ld; +} + +/****************************************************************************** +* findDim +******************************************************************************/ +unsigned int TIOpenclWorkGroupAggregation::findDim(class CallInst* call) +{ + Value *arg = call->getArgOperand(0); + + if (ConstantInt * constInt = dyn_cast<ConstantInt>(arg)) + return constInt->getSExtValue(); + return 100; // who knows +} + +/************************************************************************** +* rewrite allocas to _wg_alloca(sizeinbytes) +**************************************************************************/ +bool TIOpenclWorkGroupAggregation::rewrite_allocas(Function &F) +{ + int wi_alloca_size = 0; + Module *M = F.getParent(); + AllocaInst *alloca; + + std::vector<AllocaInst *> allocas; + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) + if ((alloca = 
dyn_cast<AllocaInst>(&*I)) != NULL) + allocas.push_back(alloca); + if (allocas.empty()) return false; + + DataLayout dataLayout(M); + FunctionType *ft = FunctionType::get + (/*Result=*/ IntegerType::get(M->getContext(), 32), + /*Params=*/ IntegerType::get(M->getContext(), 32), + /*isVarArg=*/ false); + Function *wg_alloca = dyn_cast<Function>( + M->getOrInsertFunction("_wg_alloca", ft)); + Type *Int32 = Type::getInt32Ty(M->getContext()); + + for (std::vector<AllocaInst *>::iterator I = allocas.begin(); + I != allocas.end(); ++I) + { + alloca = *I; + + // get number of elements, element type size, compute total size + Value *numElems = alloca->getArraySize(); + // YUAN TODO: skip regular constant numElems? + + Type *elementType = alloca->getAllocatedType(); + // getTypeSizeInBits(), what about uchar3 type? + uint64_t esBytes = dataLayout.getTypeStoreSize(elementType); + Value *esize = ConstantInt::get(Int32, (uint32_t) esBytes); + Instruction *alloca_size = BinaryOperator::Create( + Instruction::Mul, esize, numElems, "", alloca); + SmallVector<Value *, 4> args; + args.push_back(alloca_size); + + // create function call: _wg_alloca(alloca_size) + CallInst *f_alloca = CallInst::Create( + wg_alloca, ArrayRef<Value *>(args), "", alloca); + + // cast to alloca type + Instruction * new_alloca = new IntToPtrInst( + f_alloca, alloca->getType()); + + // replace AllocaInst with new _wg_alloca() + ReplaceInstWithInst(alloca, new_alloca); + + // accumulate element type size + unsigned align = dataLayout.getPrefTypeAlignment(elementType); + wi_alloca_size = (wi_alloca_size + align - 1) & (~(align-1)); + wi_alloca_size += esBytes; + } + + // initialize _wg_alloca_start and _wg_alloca_size + // _wg_alloca_size = load(packetaddr+offset); + // _wg_alloca_start = load(packetaddr+offset) + __core_num() * _wg_alloca_size; + Instruction *inspt = F.getEntryBlock().getFirstNonPHI(); + FunctionType *core_num_ft = FunctionType::get + (/*Result=*/ IntegerType::get(M->getContext(), 32), + 
/*isVarArg=*/ false); + Function *core_num = dyn_cast<Function>( + M->getOrInsertFunction("__core_num", core_num_ft)); + Instruction *f_core_num = CallInst::Create(core_num, "", inspt); + + Instruction *wg_alloca_size = createLoadGlobal(17, M, inspt); + + Instruction *shift = BinaryOperator::Create(Instruction::Mul, f_core_num, + wg_alloca_size, "", inspt); + + Instruction *start = createLoadGlobal(16, M, inspt); + + Instruction *core_start = BinaryOperator::Create( + Instruction::Add, start, shift, "", inspt); + Value *gv = M->getOrInsertGlobal("_wg_alloca_start", Int32); + GlobalVariable *wg_gv = M->getNamedGlobal("_wg_alloca_start"); + wg_gv->setSection(StringRef("far")); + Instruction *store = new StoreInst(core_start, gv, inspt); + + // put total orig_wi_size into attributes data in the function + char *s_wi_alloca_size = new char[32]; // we have to leak this + snprintf(s_wi_alloca_size, 32, "_wi_alloca_size=%d", wi_alloca_size); + F.addFnAttr(StringRef(s_wi_alloca_size)); + + return true; +} + +/************************************************************************** +* rewrite_ocl_funcs +**************************************************************************/ +bool TIOpenclWorkGroupAggregation::rewrite_ocl_funcs(Function &F) +{ + CallInst *call; + Module *M = F.getParent(); + std::vector<CallInst *> wi_calls; + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) + { + if ((call = dyn_cast<CallInst>(&*I)) == NULL) continue; + if (call->getCalledFunction() == NULL) continue; + string name(call->getCalledFunction()->getName()); + if (name != "get_local_id" && name != "get_local_size") continue; + wi_calls.push_back(call); + } + if (wi_calls.empty()) return false; + + LLVMContext &ctx = F.getContext(); + std::vector<CallInst *>::iterator I, E; + for (I = wi_calls.begin(), E = wi_calls.end(); I != E; ++I) + { + call = *I; + string name(call->getCalledFunction()->getName()); + + if (name == "get_local_id") + { + if (is_pocl_mode) + { + 
ReplaceInstWithInst(call, get_IV(F, call)); + } + else + { + BasicBlock::iterator BI(call); + ReplaceInstWithValue(call->getParent()->getInstList(), BI, + get_IV(F, call)); + } + } + else if (name == "get_local_size") + { + // remaining get_local_size() are generated by pocl, + // arguments guaranteed to be constants: 0, 1, or 2 + ReplaceInstWithInst(call, + createLoadGlobal(4+findDim(call), M)); + } + } + return true; +} + +BasicBlock* TIOpenclWorkGroupAggregation::findExitBlock(Function &F) +{ + BasicBlock *exit = 0; + + /*------------------------------------------------------------------------- + * Find the one block with no successors + *------------------------------------------------------------------------*/ + for (Function::iterator B = F.begin(), E = F.end(); B != E; ++B) + if ((*B).getTerminator()->getNumSuccessors() == 0) + if (!exit) exit = &(*B); + else assert(false); + + /*------------------------------------------------------------------------- + * Split the return off into it's own block + *------------------------------------------------------------------------*/ + Instruction *ret = exit->getTerminator(); + + if (ret != &exit->front()) + exit = SplitBlock(exit, ret, this); + + return exit; +} + +/************************************************************************** +* add_loop(Function &F) +**************************************************************************/ +void TIOpenclWorkGroupAggregation::add_loop(Function &F, int dimIdx) +{ + LLVMContext &ctx = F.getContext(); + Type *Int32 = Type::getInt32Ty(ctx); + Value *zero = ConstantInt::get(Int32, 0); + Value *one = ConstantInt::get(Int32, 1); + Module *M = F.getParent(); + + BasicBlock* exit = findExitBlock(F); + BasicBlock* entry = &(F.getEntryBlock()); + + BasicBlock* bodytop = SplitBlock(entry, &entry->front(), this); + BasicBlock* bodyend = exit; + exit = SplitBlock(bodyend, &exit->front(), this); + + exit->setName(".exit"); + entry->setName(".entry"); + bodytop->setName(".bodyTop"); 
+ bodyend->setName(".bodyEnd"); + + /*---------------------------------------------------------------------- + * Populate the branch around + *---------------------------------------------------------------------*/ + Instruction *branch = entry->getTerminator(); + Instruction *ld_upper_bnd = createLoadGlobal(4+dimIdx, M, branch); + + Instruction *cmp = CmpInst::Create (Instruction::ICmp, CmpInst::ICMP_SGT, + ld_upper_bnd, zero, "", branch); + + Instruction *cbr = BranchInst::Create(bodytop, exit, cmp); + ReplaceInstWithInst(branch, cbr); + + /*---------------------------------------------------------------------- + * Add the phi node to the top of the body + *---------------------------------------------------------------------*/ + PHINode *phi = PHINode::Create(Int32, 0, "", &bodytop->front()); + phi->addIncoming(zero, entry); + + /*---------------------------------------------------------------------- + * Add the loop control to the bottom of the bodyend + *---------------------------------------------------------------------*/ + branch = bodyend->getTerminator(); + Instruction *inc = BinaryOperator::Create(Instruction::Add, + phi, one, Twine(), branch); + + Instruction *ld_upper_bnd2 = createLoadGlobal(4+dimIdx, M, branch); + Instruction *cmp2 = CmpInst::Create (Instruction::ICmp, CmpInst::ICMP_SLT, + inc, ld_upper_bnd2, "", branch); + + Instruction *cbr2 = BranchInst::Create(bodytop, exit, cmp2); + ReplaceInstWithInst(branch, cbr2); + + phi->addIncoming(inc, bodyend); + IVPhi[dimIdx] = phi; + + // YUAN TODO: maybe handled better later + if (dimIdx < 1) IVPhi[1] = phi; + if (dimIdx < 2) IVPhi[2] = phi; +} + +char TIOpenclWorkGroupAggregation::ID = 0; +static RegisterPass<TIOpenclWorkGroupAggregation> + X("wga", "Work Group Aggregation", false, false); + +} diff --git a/src/core/dsp/wga.h b/src/core/dsp/wga.h new file mode 100644 index 0000000..8728fea --- /dev/null +++ b/src/core/dsp/wga.h @@ -0,0 +1,72 @@ 
+/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#ifndef __TIOPENCLWORKGROUPAGGREGATIONPASS_H +#define __TIOPENCLWORKGROUPAGGREGATIONPASS_H + +#include <string> +#include <set> +#include "boost/tuple/tuple.hpp" +#include <llvm/Pass.h> +#include <llvm/IR/Instruction.h> + +#define MAX_DIMENSIONS 3u + +namespace llvm +{ + +class TIOpenclWorkGroupAggregation : public FunctionPass +{ + public: + static char ID; + + TIOpenclWorkGroupAggregation(bool pocl_mode = false); + virtual bool runOnFunction(Function &F); + virtual void getAnalysisUsage(AnalysisUsage &Info) const; + + private: + Instruction* IVPhi[MAX_DIMENSIONS]; + bool is_pocl_mode; + + private: + Instruction* createLoadGlobal(int32_t idx, Module* m, Instruction *before=0, + const char *name=0); + + BasicBlock* findExitBlock (Function &F); + unsigned int findNeededLoopNest(Function &F); + unsigned int findDim (class CallInst* call); + bool rewrite_ocl_funcs (Function &F); + void add_loop (Function &F, int dimIdx); + Instruction* get_IV(Function &F, CallInst *call); + bool rewrite_allocas(Function &F); +}; + +Pass *createTIOpenclWorkGroupAggregationPass(bool is_pocl_mode = false); + +} + +#endif // __TIOPENCLWORKGROUPAGGREGATIONPASS_H diff --git a/src/core/dsp/worker.cpp b/src/core/dsp/worker.cpp new file mode 100644 index 0000000..79223f0 --- /dev/null +++ b/src/core/dsp/worker.cpp @@ -0,0 +1,519 @@ +/****************************************************************************** + * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "device.h" +#include "buffer.h" +#include "kernel.h" +#include "driver.h" + +#include "../commandqueue.h" +#include "../events.h" +#include "../memobject.h" +#include "../kernel.h" + +#include <stdlib.h> +#include <iostream> +#include <string.h> + +#include "u_locks_pthread.h" + +using namespace Coal; + +#define ERR(status, msg) if (status) { printf("ERROR: %s\n", msg); exit(-1); } + +/****************************************************************************** +* handle_event_completion +* Takes one kernel id from device->mail_from(), looks up the matching pending +* event, frees the kernel's temporary buffers and marks the event Complete +* (recording the end time first when the queue has profiling enabled). +* Exits the process when the id has no pending event. +******************************************************************************/ +void handle_event_completion(DSPDevice *device) +{ + int k_id = device->mail_from(); + + /*------------------------------------------------------------------------- + * A negative id is a false completion message (due to printf traffic, + * etc.): there is nothing to complete. + *------------------------------------------------------------------------*/ + if (k_id < 0) return; + + Event* event; + bool found = device->get_complete_pending(k_id, event); + if (!found) + { + std::cout << "Completion status received for kernel Id " << k_id << + " but no pending event found for that id" << std::endl; + exit(-1); + } + + KernelEvent *e = (KernelEvent *) event; + DSPKernelEvent *ke = (DSPKernelEvent *)e->deviceData(); + ke->free_tmp_bufs(); + + CommandQueue *queue = 0; + cl_command_queue_properties queue_props = 0; + + event->info(CL_EVENT_COMMAND_QUEUE, sizeof(CommandQueue *), &queue, 0); + + if (queue) + queue->info(CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), + &queue_props, 0); + + // an event may be released once it is Complete, so the end time is + // recorded before the status change + if (queue_props & CL_QUEUE_PROFILING_ENABLE) + event->updateTiming(Event::End); + event->setStatus(Event::Complete); +} + + +/****************************************************************************** +* handle_event_dispatch +******************************************************************************/ +bool 
handle_event_dispatch(DSPDevice *device) +{ + bool stop = false; + cl_int errcode; + Event * event; + + event = device->getEvent(stop); + + /*--------------------------------------------------------------------- + * Ensure we have a good event and we don't have to stop + *--------------------------------------------------------------------*/ + if (stop) return true; + if (!event) return false; + + /*--------------------------------------------------------------------- + * Get info about the event and its command queue + *--------------------------------------------------------------------*/ + Event::Type t = event->type(); + CommandQueue * queue = 0; + cl_command_queue_properties queue_props = 0; + + errcode = CL_SUCCESS; + + event->info(CL_EVENT_COMMAND_QUEUE, sizeof(CommandQueue *), &queue, 0); + + if (queue) + queue->info(CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), + &queue_props, 0); + + if (queue_props & CL_QUEUE_PROFILING_ENABLE) + event->updateTiming(Event::Start); + + /*--------------------------------------------------------------------- + * Execute the action + *--------------------------------------------------------------------*/ + switch (t) + { + case Event::ReadBuffer: + case Event::WriteBuffer: + { + ReadWriteBufferEvent *e = (ReadWriteBufferEvent *)event; + + if (e->buffer()->flags() & CL_MEM_USE_HOST_PTR) + { + // The backing store is the host pointer itself; honor the + // requested offset just like the device path below does. + char *host = (char *)e->buffer()->host_ptr() + e->offset(); + + if (t == Event::ReadBuffer) + memcpy(e->ptr(), host, e->cb()); + else memcpy(host, e->ptr(), e->cb()); + break; + } + + DSPBuffer *buf = (DSPBuffer *)e->buffer()->deviceBuffer(device); + DSPDevicePtr64 data = (DSPDevicePtr64)buf->data() + e->offset(); + + if (t == Event::ReadBuffer) + Driver::instance()->read(device->dspID(), data, + (uint8_t*)e->ptr(), e->cb()); + + else + Driver::instance()->write(device->dspID(), data, + (uint8_t*)e->ptr(), e->cb()); + + break; + } + + case Event::CopyBuffer: + { +#ifdef DSPC868X + std::cerr << "Event type not yet supported" << std::endl; +#else + 
CopyBufferEvent *e = (CopyBufferEvent *)event; + + DSPDevicePtr64 src_addr; + DSPDevicePtr64 dst_addr; + + void *psrc; + void *pdst; + + if (e->source()->flags() & CL_MEM_USE_HOST_PTR) + psrc = (char*)e->source()->host_ptr() + e->src_offset(); + else + { + DSPBuffer *src = (DSPBuffer*)e->source()->deviceBuffer(device); + src_addr = (DSPDevicePtr64)src->data() + e->src_offset(); + psrc = Driver::instance()->map(src_addr, e->cb(), true); + } + + if (e->destination()->flags() & CL_MEM_USE_HOST_PTR) + pdst = (char *)e->destination()->host_ptr() + e->dst_offset(); + else + { + DSPBuffer *dst = (DSPBuffer*)e->destination()->deviceBuffer(device); + dst_addr = (DSPDevicePtr64)dst->data() + e->dst_offset(); + pdst = Driver::instance()->map(dst_addr, e->cb(), false); + } + + memcpy(pdst, psrc, e->cb()); + + if (!(e->source()->flags() & CL_MEM_USE_HOST_PTR)) + Driver::instance()->unmap(psrc, src_addr, e->cb(), false); + + if (!(e->destination()->flags() & CL_MEM_USE_HOST_PTR)) + Driver::instance()->unmap(pdst, dst_addr, e->cb(), true); +#endif + break; + } + + case Event::ReadBufferRect: + case Event::WriteBufferRect: + { + ReadWriteBufferRectEvent *e = (ReadWriteBufferRectEvent *)event; + + // Calculate the start points for each block of memory referenced + DSPDevicePtr64 buf_start; + uint8_t * host_start; + + if (e->buffer()->flags() & CL_MEM_USE_HOST_PTR) + buf_start = (DSPDevicePtr64)e->buffer()->host_ptr(); + else + buf_start = ((DSPBuffer *)e->source()->deviceBuffer(device)) + ->data(); + + buf_start += e->src_origin(2) * e->src_slice_pitch() + + e->src_origin(1) * e->src_row_pitch() + + e->src_origin(0); + + host_start = (uint8_t *)e->ptr() + + e->dst_origin(2) * e->dst_slice_pitch() + + e->dst_origin(1) * e->dst_row_pitch() + + e->dst_origin(0); + + // Map the device/host buffers to the appopriate src/dst operands + // based on the requested operation (read vs write) + DSPDevicePtr64 src_start, dst_start; + + size_t src_row_pitch, dst_row_pitch; + size_t 
src_slice_pitch, dst_slice_pitch; + + if (t == Event::ReadBufferRect) + { + src_start = buf_start; + src_row_pitch = e->src_row_pitch(); + src_slice_pitch = e->src_slice_pitch(); + + dst_start = (DSPDevicePtr64) host_start; + dst_row_pitch = e->dst_row_pitch(); + dst_slice_pitch = e->dst_slice_pitch(); + } + else + { + src_start = (DSPDevicePtr64) host_start; + src_row_pitch = e->dst_row_pitch(); + src_slice_pitch = e->dst_slice_pitch(); + + dst_start = buf_start; + dst_row_pitch = e->src_row_pitch(); + dst_slice_pitch = e->src_slice_pitch(); + } + + // The dimensions of the region to be copied gives us our + // loop boundaries for copying + cl_ulong xdim = e->region(0); + cl_ulong ydim = e->region(1); + cl_ulong zdim = e->region(2); + + // Set up the start point + DSPDevicePtr64 src_cur_slice = src_start; + DSPDevicePtr64 dst_cur_slice = dst_start; + + // The outer loop handles each z-axis slice + // For 2-D copy, will only iterate once (zdim=1) + for(cl_uint z = 0; z < zdim; z++) + { + DSPDevicePtr64 src_cur_row = src_cur_slice; + DSPDevicePtr64 dst_cur_row = dst_cur_slice; + + // The inner loop handles each row of the current slice + for(cl_uint y = 0; y < ydim; y++) + { + // Copy a row + if (e->buffer()->flags() & CL_MEM_USE_HOST_PTR) + memcpy((void *)dst_cur_row, (void *)src_cur_row, xdim); + else + { + if (t == Event::ReadBufferRect) + Driver::instance()->read(device->dspID(), + src_cur_row, (uint8_t *)dst_cur_row, xdim); + else + Driver::instance()->write(device->dspID(), + dst_cur_row, (uint8_t *)src_cur_row, xdim); + } + + // Proceed to next row + src_cur_row += src_row_pitch; + dst_cur_row += dst_row_pitch; + } + + // Proceed to next slice + src_cur_slice += src_slice_pitch; + dst_cur_slice += dst_slice_pitch; + } + break; + } + + case Event::CopyBufferRect: + { +#ifdef DSPC868X + std::cerr << "Event type not yet supported" << std::endl; +#else + CopyBufferRectEvent *e = (CopyBufferRectEvent *)event; + + // Calculate the offsets into each buffer + size_t 
src_offset, dst_offset; + + src_offset = e->src_origin(2) * e->src_slice_pitch() + + e->src_origin(1) * e->src_row_pitch() + + e->src_origin(0); + + dst_offset = e->dst_origin(2) * e->dst_slice_pitch() + + e->dst_origin(1) * e->dst_row_pitch() + + e->dst_origin(0); + + // Set up start points for the copy. If it is a DSP buffer, we'll + // need to map the buffer before copying (done in copy loop below) + DSPDevicePtr64 src_start, dst_start; + + if (e->source()->flags() & CL_MEM_USE_HOST_PTR) + src_start = (DSPDevicePtr64)e->source()->host_ptr() + src_offset; + else + { + DSPBuffer *src = (DSPBuffer*)e->source()->deviceBuffer(device); + src_start = src->data() + src_offset; + } + + if (e->destination()->flags() & CL_MEM_USE_HOST_PTR) + dst_start = (DSPDevicePtr64)e->destination()->host_ptr() + dst_offset; + else + { + DSPBuffer *dst=(DSPBuffer*)e->destination()->deviceBuffer(device); + dst_start = dst->data() + dst_offset; + } + + // The dimensions of the region to be copied + cl_ulong xdim = e->region(0); + cl_ulong ydim = e->region(1); + cl_ulong zdim = e->region(2); + + // If we need to map memory we will currently map a slice + // at a time. 
So determine the size of a 2D slice + size_t src_slice_size = ydim * e->src_row_pitch()-e->src_origin(0); + size_t dst_slice_size = ydim * e->dst_row_pitch()-e->dst_origin(0); + + // Set up the initial copy point + DSPDevicePtr64 src_cur_slice = src_start; + DSPDevicePtr64 dst_cur_slice = dst_start; + + // The outer loop handles each z-axis slice + // For 2-D copy, will only iterate once (zdim=1) + for(cl_ulong z = 0; z < zdim; z++) + { + uint8_t *src_cur_row = (uint8_t *)src_cur_slice; + uint8_t *dst_cur_row = (uint8_t *)dst_cur_slice; + uint8_t *src_cur_mslice, *dst_cur_mslice; + + // If necessary, memory map a slice of buffer + if (!(e->source()->flags() & CL_MEM_USE_HOST_PTR)) + src_cur_row = src_cur_mslice = (uint8_t *) + Driver::instance()->map(src_cur_slice, src_slice_size,true); + + if (!(e->destination()->flags() & CL_MEM_USE_HOST_PTR)) + dst_cur_row = dst_cur_mslice = (uint8_t *) + Driver::instance()->map(dst_cur_slice, dst_slice_size,false); + + // The inner loop handles each row of the current slice + for(cl_ulong y = 0; y < ydim; y++) + { + // Copy current row + memcpy(dst_cur_row, src_cur_row, xdim); + + // Proceed to next row + src_cur_row += e->src_row_pitch(); + dst_cur_row += e->dst_row_pitch(); + } + + // If necessary, unmap the current slice + if (!(e->source()->flags() & CL_MEM_USE_HOST_PTR)) + Driver::instance()->unmap(src_cur_mslice, src_cur_slice, + src_slice_size, false); + + if (!(e->destination()->flags() & CL_MEM_USE_HOST_PTR)) + Driver::instance()->unmap(dst_cur_mslice, dst_cur_slice, + dst_slice_size, true); + + // Proceed to next slice + src_cur_slice += e->src_slice_pitch(); + dst_cur_slice += e->dst_slice_pitch(); + } +#endif + break; + } + + case Event::ReadImage: + case Event::WriteImage: + case Event::CopyImage: + case Event::CopyBufferToImage: + case Event::CopyImageToBuffer: + case Event::MapImage: + { + std::cerr << "Images are not supported" << std::endl; + break; + } + + case Event::MapBuffer: + { +#ifdef DSPC868X + 
std::cerr << "Event type not yet supported" << std::endl; +#endif + MapBufferEvent *e = (MapBufferEvent *)event; + + /*----------------------------------------------------------- + * for USE_HOST_PTR, the buffer store is already on the host and + * map should not be needed. + -----------------------------------------------------------*/ + if (e->buffer()->flags() & CL_MEM_USE_HOST_PTR) break; + + clRetainEvent((cl_event) e); + if(! e->buffer()->addMapEvent(e)) + ERR(1, "MapBuffer: Range conflicts with previous maps"); + if ((e->flags() & CL_MAP_READ) != 0) + { + DSPBuffer *buf = (DSPBuffer *)e->buffer()->deviceBuffer(device); + DSPDevicePtr64 data = (DSPDevicePtr64)buf->data() + e->offset(); + Driver::instance()->map(data, e->cb(), true); + } + break; + } + case Event::UnmapMemObject: + { +#ifdef DSPC868X + std::cerr << "Event type not yet supported" << std::endl; +#endif + UnmapBufferEvent *e = (UnmapBufferEvent *)event; + + /*----------------------------------------------------------- + * for USE_HOST_PTR, the buffer store is already on the host and + * unmap should not be needed. 
+ -----------------------------------------------------------*/ + if (e->buffer()->flags() & CL_MEM_USE_HOST_PTR) break; + + if (e->buffer()->type() != Coal::MemObject::Buffer && + e->buffer()->type() != Coal::MemObject::SubBuffer) + ERR(1, "UnmapMemObject: MapImage/Unmap not support yet"); + MapBufferEvent *mbe = (MapBufferEvent *) + e->buffer()->removeMapEvent(e->mapping()); + if (mbe == NULL) + ERR(1, "UnmapMemObject: host_ptr not from previous maps"); + + if ((mbe->flags() & CL_MAP_WRITE) != 0) + { + DSPBuffer *buf = (DSPBuffer *)e->buffer()->deviceBuffer(device); + DSPDevicePtr64 buf_dsp_addr = (DSPDevicePtr64)buf->data(); + Driver::instance()->unmap(e->mapping(), buf_dsp_addr, + mbe->cb(), true); + } + if (queue) queue->releaseEvent(mbe); + break; + } + + case Event::NativeKernel: + { + std::cerr << "Native Kernels not supported on the DSP" << std::endl; + break; + } + + case Event::NDRangeKernel: + case Event::TaskKernel: + { + KernelEvent *e = (KernelEvent *) event; + DSPKernelEvent *ke = (DSPKernelEvent *)e->deviceData(); + + errcode = ke->run(t); + + /*----------------------------------------------------------------- + * Put the event on a pending completion list and its + * completion will be handled asynchronously. + *----------------------------------------------------------------*/ + if (errcode == CL_SUCCESS) + { + device->push_complete_pending(ke->kernel_id(), e); + return false; + } + break; + } + default: break; + } + + /*--------------------------------------------------------------------- + * Cleanup + *--------------------------------------------------------------------*/ + + // an event may be released once it is Complete + if (queue_props & CL_QUEUE_PROFILING_ENABLE) + event->updateTiming(Event::End); + event->setStatus((errcode == CL_SUCCESS) ? 
Event::Complete : + (Event::Status)errcode); + + return false; +} + +/****************************************************************************** +* dsp_worker +******************************************************************************/ +void *dsp_worker(void *data) +{ + DSPDevice *device = (DSPDevice *)data; + + while (true) + { + if (device->any_complete_pending() && device->mail_query()) + handle_event_completion(device); + + bool stop = device->stop(); + + if (!stop && device->availableEvent()) + stop |= handle_event_dispatch(device); + + if (stop && !device->any_complete_pending()) break; + } +} diff --git a/src/core/events.cpp b/src/core/events.cpp new file mode 100644 index 0000000..629a0c9 --- /dev/null +++ b/src/core/events.cpp @@ -0,0 +1,1519 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file events.cpp + * \brief Events inheriting \c Coal::Event + */ + +#include "events.h" +#include "commandqueue.h" +#include "memobject.h" +#include "kernel.h" +#include "deviceinterface.h" + +#include <cstdlib> +#include <cstring> +#include <iostream> + +using namespace Coal; + +/* + * Read/Write buffers + */ + +BufferEvent::BufferEvent(CommandQueue *parent, + MemObject *buffer, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret) +: Event(parent, Queued, num_events_in_wait_list, event_wait_list, errcode_ret), + p_buffer(buffer) +{ + clRetainMemObject((cl_mem) p_buffer); + + if (*errcode_ret != CL_SUCCESS) return; + + // Correct buffer + if (!buffer) + { + *errcode_ret = CL_INVALID_MEM_OBJECT; + return; + } + + // Buffer's context must match the CommandQueue one + Context *ctx = 0; + *errcode_ret = parent->info(CL_QUEUE_CONTEXT, sizeof(Context *), &ctx, 0); + + if (*errcode_ret != CL_SUCCESS) return; + + if ((Context *)buffer->parent() != ctx) + { + *errcode_ret = CL_INVALID_CONTEXT; + return; + } + + // Alignment of SubBuffers + DeviceInterface *device = 0; + *errcode_ret = parent->info(CL_QUEUE_DEVICE, sizeof(DeviceInterface *), + &device, 0); + + if (*errcode_ret != CL_SUCCESS) + return; + + if (!isSubBufferAligned(buffer, device)) + { + *errcode_ret = CL_MISALIGNED_SUB_BUFFER_OFFSET; + return; + } + + // Allocate the buffer for the device + if (!buffer->allocate(device)) + { + *errcode_ret = 
CL_MEM_OBJECT_ALLOCATION_FAILURE; + return; + } +} + +BufferEvent::~BufferEvent() +{ + clReleaseMemObject((cl_mem) p_buffer); +} + +MemObject *BufferEvent::buffer() const +{ + return p_buffer; +} + +bool BufferEvent::isSubBufferAligned(const MemObject *buffer, + const DeviceInterface *device) +{ + cl_uint align; + cl_int rs; + + if (buffer->type() != MemObject::SubBuffer) + return true; + + rs = device->info(CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(uint), + &align, 0); + + if (rs != CL_SUCCESS) + return false; + + size_t mask = 0; + if (align != 0) mask = align - 1; + + if (((SubBuffer *)buffer)->offset() & mask) + return false; + + return true; +} + +ReadWriteBufferEvent::ReadWriteBufferEvent(CommandQueue *parent, + MemObject *buffer, + size_t offset, + size_t cb, + void *ptr, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret) +: BufferEvent(parent, buffer, num_events_in_wait_list, event_wait_list, errcode_ret), + p_offset(offset), p_cb(cb), p_ptr(ptr) +{ + if (*errcode_ret != CL_SUCCESS) return; + + // Check for out-of-bounds reads + if (!ptr) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + + if (offset + cb > buffer->size()) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } +} + +size_t ReadWriteBufferEvent::offset() const +{ + return p_offset; +} + +size_t ReadWriteBufferEvent::cb() const +{ + return p_cb; +} + +void *ReadWriteBufferEvent::ptr() const +{ + return p_ptr; +} + +ReadBufferEvent::ReadBufferEvent(CommandQueue *parent, + MemObject *buffer, + size_t offset, + size_t cb, + void *ptr, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret) +: ReadWriteBufferEvent(parent, buffer, offset, cb, ptr, num_events_in_wait_list, + event_wait_list, errcode_ret) +{} + +Event::Type ReadBufferEvent::type() const +{ + return Event::ReadBuffer; +} + +WriteBufferEvent::WriteBufferEvent(CommandQueue *parent, + MemObject *buffer, + size_t offset, + size_t cb, + void *ptr, + cl_uint 
num_events_in_wait_list,
+                                   const Event **event_wait_list,
+                                   cl_int *errcode_ret)
+: ReadWriteBufferEvent(parent, buffer, offset, cb, ptr, num_events_in_wait_list,
+                       event_wait_list, errcode_ret)
+{}
+
+Event::Type WriteBufferEvent::type() const
+{
+    return Event::WriteBuffer;
+}
+
+/*
+ * Event describing a mapping of [offset, offset + cb) of a buffer.
+ *
+ * On return *errcode_ret is CL_INVALID_VALUE when map_flags carries bits
+ * other than CL_MAP_READ / CL_MAP_WRITE or when the range does not fit in
+ * the buffer; errors raised by the BufferEvent base are left untouched.
+ * The mapped pointer starts out NULL until setPtr() is called.
+ */
+MapBufferEvent::MapBufferEvent(CommandQueue *parent,
+                               MemObject *buffer,
+                               size_t offset,
+                               size_t cb,
+                               cl_map_flags map_flags,
+                               cl_uint num_events_in_wait_list,
+                               const Event **event_wait_list,
+                               cl_int *errcode_ret)
+: BufferEvent(parent, buffer, num_events_in_wait_list, event_wait_list, errcode_ret),
+  p_offset(offset), p_cb(cb), p_map_flags(map_flags)
+{
+    // ptr() must not expose an indeterminate value before setPtr() runs
+    p_ptr = 0;
+
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    // Check flags
+    if (map_flags & ~(CL_MAP_READ | CL_MAP_WRITE))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Check for out-of-bounds values; written so that offset + cb cannot
+    // wrap around and slip past the comparison
+    const size_t buffer_size = buffer->size();
+
+    if (cb > buffer_size || offset > buffer_size - cb)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+}
+
+Event::Type MapBufferEvent::type() const
+{
+    return Event::MapBuffer;
+}
+
+size_t MapBufferEvent::offset() const
+{
+    return p_offset;
+}
+
+size_t MapBufferEvent::cb() const
+{
+    return p_cb;
+}
+
+cl_map_flags MapBufferEvent::flags() const
+{
+    return p_map_flags;
+}
+
+void *MapBufferEvent::ptr() const
+{
+    return p_ptr;
+}
+
+void MapBufferEvent::setPtr(void *ptr)
+{
+    p_ptr = ptr;
+}
+
+/*
+ * Event describing a mapping of a rectangular region of an image.
+ *
+ * origin may be NULL (treated as {0, 0, 0}); region must be non-NULL with
+ * three non-zero entries. The x components are stored in bytes, i.e.
+ * multiplied by image->pixel_size().
+ *
+ * On return *errcode_ret is CL_INVALID_VALUE for bad flags, a NULL or
+ * zero-sized region, a 2D image with origin[2] != 0 or region[2] != 1, or
+ * a region that exceeds the image pitches/size.
+ * Pointer and pitches start out as 0 until the setters are called.
+ */
+MapImageEvent::MapImageEvent(CommandQueue *parent,
+                             Image2D *image,
+                             cl_map_flags map_flags,
+                             const size_t origin[3],
+                             const size_t region[3],
+                             cl_uint num_events_in_wait_list,
+                             const Event **event_wait_list,
+                             cl_int *errcode_ret)
+: BufferEvent (parent, image, num_events_in_wait_list, event_wait_list, errcode_ret)
+{
+    // Give every member read by an accessor a defined value up front;
+    // flags() previously returned a member that was never written.
+    p_map_flags = map_flags;
+    p_ptr = 0;
+    p_row_pitch = 0;
+    p_slice_pitch = 0;
+
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    // Check flags
+    if (map_flags & ~(CL_MAP_READ | CL_MAP_WRITE))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // The region is mandatory
+    if (!region)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Copy the vectors
+    if (origin)
+        std::memcpy(&p_origin, origin, 3 * sizeof(size_t));
+    else
+        std::memset(&p_origin, 0, 3 * sizeof(size_t));
+
+    for (unsigned int i=0; i<3; ++i)
+    {
+        if (!region[i])
+        {
+            *errcode_ret = CL_INVALID_VALUE;
+            return;
+        }
+
+        p_region[i] = region[i];
+    }
+
+    // Multiply the elements (for images)
+    p_region[0] *= image->pixel_size();
+    p_origin[0] *= image->pixel_size();
+
+    // A 2D image has a single slice. Use the stored copies: only index 0
+    // was scaled above, and `origin` itself may be NULL.
+    if (image->type() == MemObject::Image2D &&
+        (p_origin[2] != 0 || p_region[2] != 1))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Check for out-of-bounds
+    if ((p_origin[0] + p_region[0]) > image->row_pitch() ||
+        (p_origin[1] + p_region[1]) * image->row_pitch() > image->slice_pitch() ||
+        (p_origin[2] + p_region[2]) * image->slice_pitch() > image->size())
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+}
+
+Event::Type MapImageEvent::type() const
+{
+    return Event::MapImage;
+}
+
+
+cl_map_flags MapImageEvent::flags() const
+{
+    return p_map_flags;
+}
+
+size_t MapImageEvent::origin (unsigned int index) const
+{
+    return p_origin[index];
+}
+
+size_t MapImageEvent::region (unsigned int index) const
+{
+    return p_region[index];
+}
+
+size_t MapImageEvent::row_pitch() const
+{
+    return p_row_pitch;
+}
+
+size_t MapImageEvent::slice_pitch() const
+{
+    return p_slice_pitch;
+}
+
+void *MapImageEvent::ptr() const
+{
+    return p_ptr;
+}
+
+void MapImageEvent::setRowPitch (size_t row_pitch)
+{
+    p_row_pitch = row_pitch;
+}
+
+void MapImageEvent::setSlicePitch (size_t slice_pitch)
+{
+    p_slice_pitch = slice_pitch;
+}
+
+void MapImageEvent::setPtr (void *ptr)
+{
+    p_ptr = ptr;
+}
+
+UnmapBufferEvent::UnmapBufferEvent(CommandQueue *parent,
+                                   MemObject *buffer,
+                                   void *mapped_addr,
+                                   cl_uint num_events_in_wait_list,
+                                   const Event **event_wait_list,
+                                   cl_int *errcode_ret)
+: BufferEvent(parent, buffer, num_events_in_wait_list, event_wait_list, errcode_ret),
+  p_mapping(mapped_addr)
+{
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    // TODO: Check that p_mapping is ok (will be done in the drivers)
+    if (!mapped_addr)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+}
+
+Event::Type
UnmapBufferEvent::type() const +{ + return Event::UnmapMemObject; +} + +void *UnmapBufferEvent::mapping() const +{ + return p_mapping; +} + +CopyBufferEvent::CopyBufferEvent(CommandQueue *parent, + MemObject *source, + MemObject *destination, + size_t src_offset, + size_t dst_offset, + size_t cb, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret) +: BufferEvent(parent, source, num_events_in_wait_list, event_wait_list, + errcode_ret), p_destination(destination), p_src_offset(src_offset), + p_dst_offset(dst_offset), p_cb(cb) +{ + clRetainMemObject((cl_mem) p_destination); + + if (*errcode_ret != CL_SUCCESS) return; + + if (!destination) + { + *errcode_ret = CL_INVALID_MEM_OBJECT; + return; + } + + // Check for out-of-bounds + if (src_offset + cb > source->size() || + dst_offset + cb > destination->size()) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + + // Check for overlap + if (source == destination) + { + if ((src_offset < dst_offset && src_offset + cb > dst_offset) || + (dst_offset < src_offset && dst_offset + cb > src_offset)) + { + *errcode_ret = CL_MEM_COPY_OVERLAP; + return; + } + } + + // Check alignement of destination + DeviceInterface *device = 0; + *errcode_ret = parent->info(CL_QUEUE_DEVICE, sizeof(DeviceInterface *), + &device, 0); + + if (*errcode_ret != CL_SUCCESS) + return; + + if (!isSubBufferAligned(destination, device)) + { + *errcode_ret = CL_MISALIGNED_SUB_BUFFER_OFFSET; + return; + } + + // Allocate the buffer for the device + if (!destination->allocate(device)) + { + *errcode_ret = CL_MEM_OBJECT_ALLOCATION_FAILURE; + return; + } +} + +CopyBufferEvent::~CopyBufferEvent() +{ + clReleaseMemObject((cl_mem) p_destination); +} + +MemObject *CopyBufferEvent::source() const +{ + return buffer(); +} + +MemObject *CopyBufferEvent::destination() const +{ + return p_destination; +} + +size_t CopyBufferEvent::src_offset() const +{ + return p_src_offset; +} + +size_t CopyBufferEvent::dst_offset() const +{ + 
return p_dst_offset; +} + +size_t CopyBufferEvent::cb() const +{ + return p_cb; +} + +Event::Type CopyBufferEvent::type() const +{ + return Event::CopyBuffer; +} + +/* + * Native kernel + */ +NativeKernelEvent::NativeKernelEvent(CommandQueue *parent, + void (*user_func)(void *), + void *args, + size_t cb_args, + cl_uint num_mem_objects, + const MemObject **mem_list, + const void **args_mem_loc, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret) +: Event (parent, Queued, num_events_in_wait_list, event_wait_list, errcode_ret), + p_user_func((void *)user_func), p_args(0) +{ + if (*errcode_ret != CL_SUCCESS) return; + + // Parameters sanity + if (!user_func) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + + if (!args && (cb_args || num_mem_objects)) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + + if (args && !cb_args) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + + if (num_mem_objects && (!mem_list || !args_mem_loc)) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + + if (!num_mem_objects && (mem_list || args_mem_loc)) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + + // Check that the device can execute a native kernel + DeviceInterface *device; + cl_device_exec_capabilities caps; + + *errcode_ret = parent->info(CL_QUEUE_DEVICE, sizeof(DeviceInterface *), + &device, 0); + + if (*errcode_ret != CL_SUCCESS) + return; + + *errcode_ret = device->info(CL_DEVICE_EXECUTION_CAPABILITIES, + sizeof(cl_device_exec_capabilities), &caps, 0); + + if (*errcode_ret != CL_SUCCESS) + return; + + if ((caps & CL_EXEC_NATIVE_KERNEL) == 0) + { + *errcode_ret = CL_INVALID_OPERATION; + return; + } + + // Copy the arguments in a new list + if (cb_args) + { + p_args = std::malloc(cb_args); + + if (!p_args) + { + *errcode_ret = CL_OUT_OF_HOST_MEMORY; + return; + } + + std::memcpy((void *)p_args, (void *)args, cb_args); + + // Replace memory objects with global pointers + for (cl_uint i=0; i<num_mem_objects; ++i) + { + 
const MemObject *buffer = mem_list[i]; + const char *loc = (const char *)args_mem_loc[i]; + + if (!buffer) + { + *errcode_ret = CL_INVALID_MEM_OBJECT; + return; + } + + // We need to do relocation : loc is in args, we need it in p_args + size_t delta = (char *)p_args - (char *)args; + loc += delta; + + *(void **)loc = buffer->deviceBuffer(device)->nativeGlobalPointer(); + } + } +} + +NativeKernelEvent::~NativeKernelEvent() +{ + if (p_args) + std::free((void *)p_args); +} + +Event::Type NativeKernelEvent::type() const +{ + return Event::NativeKernel; +} + +void *NativeKernelEvent::function() const +{ + return p_user_func; +} + +void *NativeKernelEvent::args() const +{ + return p_args; +} + +/* + * Kernel event + */ +KernelEvent::KernelEvent(CommandQueue *parent, + Kernel *kernel, + cl_uint work_dim, + const size_t *global_work_offset, + const size_t *global_work_size, + const size_t *local_work_size, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret) +: Event(parent, Queued, num_events_in_wait_list, event_wait_list, errcode_ret), + p_work_dim(work_dim), p_kernel(kernel) +{ + clRetainKernel((cl_kernel) p_kernel); + + if (*errcode_ret != CL_SUCCESS) return; + + *errcode_ret = CL_SUCCESS; + + // Sanity checks + if (!kernel) + { + *errcode_ret = CL_INVALID_KERNEL; + return; + } + + // Check that the kernel was built for parent's device. 
+ DeviceInterface *device; + Context *k_ctx, *q_ctx; + size_t max_work_group_size; + cl_uint max_dims = 0; + + *errcode_ret = parent->info(CL_QUEUE_DEVICE, sizeof(DeviceInterface *), + &device, 0); + + if (*errcode_ret != CL_SUCCESS) + return; + + *errcode_ret = parent->info(CL_QUEUE_CONTEXT, sizeof(Context *), &q_ctx, 0); + *errcode_ret |= kernel->info(CL_KERNEL_CONTEXT, sizeof(Context *), &k_ctx, 0); + *errcode_ret |= device->info(CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), + &max_work_group_size, 0); + *errcode_ret |= device->info(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(size_t), + &max_dims, 0); + *errcode_ret |= device->info(CL_DEVICE_MAX_WORK_ITEM_SIZES, + max_dims * sizeof(size_t), p_max_work_item_sizes, 0); + + if (*errcode_ret != CL_SUCCESS) + return; + + p_dev_kernel = kernel->deviceDependentKernel(device); + + if (!p_dev_kernel) + { + *errcode_ret = CL_INVALID_PROGRAM_EXECUTABLE; + return; + } + + // Check that contexts match + if (k_ctx != q_ctx) + { + *errcode_ret = CL_INVALID_CONTEXT; + return; + } + + // Check args + if (!kernel->argsSpecified()) + { + *errcode_ret = CL_INVALID_KERNEL_ARGS; + return; + } + + // Check dimension + if (work_dim == 0 || work_dim > max_dims) + { + *errcode_ret = CL_INVALID_WORK_DIMENSION; + return; + } + + // Populate work_offset, work_size and local_work_size + size_t work_group_size = 1; + boost::tuple <uint,uint,uint> reqd_work_group_size( + kernel->reqdWorkGroupSize(kernel->deviceDependentModule(device))); + + uint reqd_x = reqd_work_group_size.get<0>(); + uint reqd_y = reqd_work_group_size.get<1>(); + uint reqd_z = reqd_work_group_size.get<2>(); + bool reqd_any = reqd_x > 0 || reqd_y > 0 || reqd_z > 0; + + if (reqd_any) + { + // if __attribute__((reqd_work_group_size(X, Y, Z))) is set and local size not specified + if (!local_work_size) + { + *errcode_ret = CL_INVALID_WORK_GROUP_SIZE; + return; + } + + // if __attribute__((reqd_work_group_size(X, Y, Z))) doesn't match + else + { + if (( local_work_size[0] != 
reqd_x) || + (work_dim > 1 && local_work_size[1] != reqd_y) || + (work_dim > 2 && local_work_size[2] != reqd_z)) + { + *errcode_ret = CL_INVALID_WORK_GROUP_SIZE; + return; + } + } + } + + cl_uint i; + for (i=0; i<work_dim; ++i) + { + if (global_work_offset) + { + p_global_work_offset[i] = global_work_offset[i]; + } + else + { + p_global_work_offset[i] = 0; + } + + if (!global_work_size || !global_work_size[i]) + { + *errcode_ret = CL_INVALID_GLOBAL_WORK_SIZE; + } + p_global_work_size[i] = global_work_size[i]; + + if (!local_work_size) + { + // Guess the best value according to the device + p_local_work_size[i] = + p_dev_kernel->guessWorkGroupSize(work_dim, i, global_work_size[i]); + } + else + { + // Check divisibility + if ((global_work_size[i] % local_work_size[i]) != 0) + { + *errcode_ret = CL_INVALID_WORK_GROUP_SIZE; + return; + } + + // Not too big ? + if (local_work_size[i] > p_max_work_item_sizes[i]) + { + *errcode_ret = CL_INVALID_WORK_ITEM_SIZE; + return; + } + + p_local_work_size[i] = local_work_size[i]; + work_group_size *= local_work_size[i]; + } + } + // initialize missing dimensions + for (; i < max_dims; i++) + { + p_global_work_offset[i] = 0; + p_global_work_size[i] = 1; + p_local_work_size[i] = 1; + } + + // Check we don't ask too much to the device + if (work_group_size > max_work_group_size) + { + *errcode_ret = CL_INVALID_WORK_GROUP_SIZE; + return; + } + + // Check arguments (buffer alignment, image size, ...) 
+ for (unsigned int i=0; i<kernel->numArgs(); ++i) + { + const Kernel::Arg *a = kernel->arg(i); + + if (a->kind() == Kernel::Arg::Buffer && a->file() != Kernel::Arg::Local) + { + const MemObject *buffer = *(const MemObject **)(a->value(0)); + + if (!BufferEvent::isSubBufferAligned(buffer, device)) + { + *errcode_ret = CL_MISALIGNED_SUB_BUFFER_OFFSET; + return; + } + } + else if (a->kind() == Kernel::Arg::Image2D) + { + const Image2D *image = *(const Image2D **)(a->value(0)); + size_t maxWidth, maxHeight; + + *errcode_ret = device->info(CL_DEVICE_IMAGE2D_MAX_WIDTH, + sizeof(size_t), &maxWidth, 0); + *errcode_ret |= device->info(CL_DEVICE_IMAGE2D_MAX_HEIGHT, + sizeof(size_t), &maxHeight, 0); + + if (*errcode_ret != CL_SUCCESS) + return; + + if (image->width() > maxWidth || image->height() > maxHeight) + { + *errcode_ret = CL_INVALID_IMAGE_SIZE; + return; + } + } + else if (a->kind() == Kernel::Arg::Image3D) + { + const Image3D *image = *(const Image3D **)a->value(0); + size_t maxWidth, maxHeight, maxDepth; + + *errcode_ret = device->info(CL_DEVICE_IMAGE3D_MAX_WIDTH, + sizeof(size_t), &maxWidth, 0); + *errcode_ret |= device->info(CL_DEVICE_IMAGE3D_MAX_HEIGHT, + sizeof(size_t), &maxHeight, 0); + *errcode_ret |= device->info(CL_DEVICE_IMAGE3D_MAX_DEPTH, + sizeof(size_t), &maxDepth, 0); + + if (*errcode_ret != CL_SUCCESS) + return; + + if (image->width() > maxWidth || image->height() > maxHeight || + image->depth() > maxDepth) + { + *errcode_ret = CL_INVALID_IMAGE_SIZE; + return; + } + } + } +} + +KernelEvent::~KernelEvent() +{ + clReleaseKernel((cl_kernel) p_kernel); +} + +cl_uint KernelEvent::work_dim() const +{ + return p_work_dim; +} + +size_t KernelEvent::global_work_offset(cl_uint dim) const +{ + return p_global_work_offset[dim]; +} + +size_t KernelEvent::global_work_size(cl_uint dim) const +{ + return p_global_work_size[dim]; +} + +size_t KernelEvent::local_work_size(cl_uint dim) const +{ + return p_local_work_size[dim]; +} + +Kernel *KernelEvent::kernel() const 
+{ + return p_kernel; +} + +DeviceKernel *KernelEvent::deviceKernel() const +{ + return p_dev_kernel; +} + +Event::Type KernelEvent::type() const +{ + return Event::NDRangeKernel; +} + +static size_t one = 1; + +TaskEvent::TaskEvent(CommandQueue *parent, + Kernel *kernel, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret) +: KernelEvent(parent, kernel, 1, 0, &one, &one, num_events_in_wait_list, + event_wait_list, errcode_ret) +{ + // TODO: CL_INVALID_WORK_GROUP_SIZE if + // __attribute__((reqd_work_group_size(X, Y, Z))) != (1, 1, 1) +} + +Event::Type TaskEvent::type() const +{ + return Event::TaskKernel; +} + +/* + * User event + */ +UserEvent::UserEvent(Context *context, cl_int *errcode_ret) +: Event(0, Submitted, 0, 0, errcode_ret), p_context(context) +{} + +Event::Type UserEvent::type() const +{ + return Event::User; +} + +Context *UserEvent::context() const +{ + return p_context; +} + +/* + * ReadWriteBufferRectEvent + */ +ReadWriteCopyBufferRectEvent::ReadWriteCopyBufferRectEvent(CommandQueue *parent, + MemObject *source, + const size_t src_origin[3], + const size_t dst_origin[3], + const size_t region[3], + size_t src_row_pitch, + size_t src_slice_pitch, + size_t dst_row_pitch, + size_t dst_slice_pitch, + unsigned int bytes_per_element, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret) +: BufferEvent (parent, source, num_events_in_wait_list, event_wait_list, + errcode_ret) +{ + if (*errcode_ret != CL_SUCCESS) return; + + // Copy the vectors + if (src_origin) + std::memcpy(&p_src_origin, src_origin, 3 * sizeof(size_t)); + else + std::memset(&p_src_origin, 0, 3 * sizeof(size_t)); + + if (dst_origin) + std::memcpy(&p_dst_origin, dst_origin, 3 * sizeof(size_t)); + else + std::memset(&p_dst_origin, 0, 3 * sizeof(size_t)); + + for (unsigned int i=0; i<3; ++i) + { + if (!region[i]) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + + p_region[i] = region[i]; + } + + // Multiply the 
elements (for images) + p_region[0] *= bytes_per_element; + p_src_origin[0] *= bytes_per_element; + p_dst_origin[0] *= bytes_per_element; + + // Compute the pitches + p_src_row_pitch = p_region[0]; + + if (src_row_pitch) + { + if (src_row_pitch < p_src_row_pitch) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + + p_src_row_pitch = src_row_pitch; + } + + p_src_slice_pitch = p_region[1] * p_src_row_pitch; + + if (src_slice_pitch) + { + if (src_slice_pitch < p_src_slice_pitch) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + + p_src_slice_pitch = src_slice_pitch; + } + + p_dst_row_pitch = p_region[0]; + + if (dst_row_pitch) + { + if (dst_row_pitch < p_dst_row_pitch) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + + p_dst_row_pitch = dst_row_pitch; + } + + p_dst_slice_pitch = p_region[1] * p_dst_row_pitch; + + if (dst_slice_pitch) + { + if (dst_slice_pitch < p_dst_slice_pitch) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + + p_dst_slice_pitch = dst_slice_pitch; + } +} + +size_t ReadWriteCopyBufferRectEvent::src_origin(unsigned int index) const +{ + return p_src_origin[index]; +} + +size_t ReadWriteCopyBufferRectEvent::dst_origin(unsigned int index) const +{ + return p_dst_origin[index]; +} + +size_t ReadWriteCopyBufferRectEvent::region(unsigned int index) const +{ + return p_region[index]; +} + +size_t ReadWriteCopyBufferRectEvent::src_row_pitch() const +{ + return p_src_row_pitch; +} + +size_t ReadWriteCopyBufferRectEvent::src_slice_pitch() const +{ + return p_src_slice_pitch; +} + +size_t ReadWriteCopyBufferRectEvent::dst_row_pitch() const +{ + return p_dst_row_pitch; +} + +size_t ReadWriteCopyBufferRectEvent::dst_slice_pitch() const +{ + return p_dst_slice_pitch; +} + +MemObject *ReadWriteCopyBufferRectEvent::source() const +{ + return buffer(); +} + +CopyBufferRectEvent::CopyBufferRectEvent(CommandQueue *parent, + MemObject *source, + MemObject *destination, + const size_t src_origin[3], + const size_t dst_origin[3], + const size_t 
region[3], + size_t src_row_pitch, + size_t src_slice_pitch, + size_t dst_row_pitch, + size_t dst_slice_pitch, + unsigned int bytes_per_element, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret) +: ReadWriteCopyBufferRectEvent(parent, source, src_origin, dst_origin, region, + src_row_pitch, src_slice_pitch, dst_row_pitch, + dst_slice_pitch, bytes_per_element, + num_events_in_wait_list, event_wait_list, errcode_ret), + p_destination(destination) +{ + if (*errcode_ret != CL_SUCCESS) return; + + if (!destination) + { + *errcode_ret = CL_INVALID_MEM_OBJECT; + return; + } + + // Check for out-of-bounds + if ((p_src_origin[0] + p_region[0]) > p_src_row_pitch || + (p_src_origin[1] + p_region[1]) * p_src_row_pitch > p_src_slice_pitch || + (p_src_origin[2] + p_region[2]) * p_src_slice_pitch > source->size()) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + + if ((p_dst_origin[0] + p_region[0]) > p_dst_row_pitch || + (p_dst_origin[1] + p_region[1]) * p_dst_row_pitch > p_dst_slice_pitch || + (p_dst_origin[2] + p_region[2]) * p_dst_slice_pitch > destination->size()) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + + // Check for overlapping + if (source == destination) + { + unsigned char overlapping_dimensions = 0; + + for (unsigned int i=0; i<3; ++i) + { + if ((p_dst_origin[i] < p_src_origin[i] && p_dst_origin[i] + p_region[i] > p_src_origin[i]) || + (p_src_origin[i] < p_dst_origin[i] && p_src_origin[i] + p_region[i] > p_dst_origin[i])) + overlapping_dimensions++; + } + + if (overlapping_dimensions == 3) + { + // If all the dimensions are overlapping, the region is overlapping + *errcode_ret = CL_MEM_COPY_OVERLAP; + return; + } + } + + // Check alignment of destination (source already checked by BufferEvent) + DeviceInterface *device = 0; + *errcode_ret = parent->info(CL_QUEUE_DEVICE, sizeof(DeviceInterface *), + &device, 0); + + if (*errcode_ret != CL_SUCCESS) + return; + + if (!isSubBufferAligned(destination, device)) + 
{ + *errcode_ret = CL_MISALIGNED_SUB_BUFFER_OFFSET; + return; + } + + // Allocate the buffer for the device + if (!destination->allocate(device)) + { + *errcode_ret = CL_MEM_OBJECT_ALLOCATION_FAILURE; + return; + } +} + +Event::Type CopyBufferRectEvent::type() const +{ + return Event::CopyBufferRect; +} + +MemObject *CopyBufferRectEvent::destination() const +{ + return p_destination; +} + +ReadWriteBufferRectEvent::ReadWriteBufferRectEvent(CommandQueue *parent, + MemObject *buffer, + const size_t buffer_origin[3], + const size_t host_origin[3], + const size_t region[3], + size_t buffer_row_pitch, + size_t buffer_slice_pitch, + size_t host_row_pitch, + size_t host_slice_pitch, + void *ptr, + unsigned int bytes_per_element, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret) +: ReadWriteCopyBufferRectEvent(parent, buffer, buffer_origin, host_origin, region, + buffer_row_pitch, buffer_slice_pitch, + host_row_pitch, host_slice_pitch, bytes_per_element, + num_events_in_wait_list, event_wait_list, errcode_ret), + p_ptr(ptr) +{ + if (*errcode_ret != CL_SUCCESS) return; + + if (!ptr) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + + // Check for out-of-bounds + if ((p_src_origin[0] + p_region[0]) > p_src_row_pitch || + (p_src_origin[1] + p_region[1]) * p_src_row_pitch > p_src_slice_pitch || + (p_src_origin[2] + p_region[2]) * p_src_slice_pitch > buffer->size()) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } +} + +void *ReadWriteBufferRectEvent::ptr() const +{ + return p_ptr; +} + +ReadBufferRectEvent::ReadBufferRectEvent (CommandQueue *parent, + MemObject *buffer, + const size_t buffer_origin[3], + const size_t host_origin[3], + const size_t region[3], + size_t buffer_row_pitch, + size_t buffer_slice_pitch, + size_t host_row_pitch, + size_t host_slice_pitch, + void *ptr, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret) +: ReadWriteBufferRectEvent(parent, buffer, buffer_origin, 
host_origin, region,
+                           buffer_row_pitch, buffer_slice_pitch, host_row_pitch,
+                           host_slice_pitch, ptr, 1, num_events_in_wait_list,
+                           event_wait_list, errcode_ret)
+{
+}
+
+Event::Type ReadBufferRectEvent::type() const
+{
+    return ReadBufferRect;
+}
+
+WriteBufferRectEvent::WriteBufferRectEvent (CommandQueue *parent,
+                                            MemObject *buffer,
+                                            const size_t buffer_origin[3],
+                                            const size_t host_origin[3],
+                                            const size_t region[3],
+                                            size_t buffer_row_pitch,
+                                            size_t buffer_slice_pitch,
+                                            size_t host_row_pitch,
+                                            size_t host_slice_pitch,
+                                            void *ptr,
+                                            cl_uint num_events_in_wait_list,
+                                            const Event **event_wait_list,
+                                            cl_int *errcode_ret)
+: ReadWriteBufferRectEvent (parent, buffer, buffer_origin, host_origin, region,
+                            buffer_row_pitch, buffer_slice_pitch, host_row_pitch,
+                            host_slice_pitch, ptr, 1, num_events_in_wait_list,
+                            event_wait_list, errcode_ret)
+{
+}
+
+Event::Type WriteBufferRectEvent::type() const
+{
+    return WriteBufferRect;
+}
+
+/*
+ * Common part of image reads and writes: a rect transfer whose buffer-side
+ * pitches and element size come from the image itself.
+ *
+ * Besides the checks done by ReadWriteBufferRectEvent, a 2D image is only
+ * accepted with origin[2] == 0 and region[2] == 1 (CL_INVALID_VALUE
+ * otherwise). origin may be NULL; the base class then stores {0, 0, 0}.
+ */
+ReadWriteImageEvent::ReadWriteImageEvent (CommandQueue *parent,
+                                          Image2D *image,
+                                          const size_t origin[3],
+                                          const size_t region[3],
+                                          size_t row_pitch,
+                                          size_t slice_pitch,
+                                          void *ptr,
+                                          cl_uint num_events_in_wait_list,
+                                          const Event **event_wait_list,
+                                          cl_int *errcode_ret)
+: ReadWriteBufferRectEvent(parent, image, origin, 0, region, image->row_pitch(),
+                           image->slice_pitch(), row_pitch, slice_pitch, ptr,
+                           image->pixel_size(), num_events_in_wait_list,
+                           event_wait_list, errcode_ret)
+{
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    // Test the copies stored by the base class rather than the caller's
+    // arrays: `origin` may legitimately be NULL, and index 2 is stored
+    // unscaled (only index 0 is multiplied by the element size).
+    if (image->type() == MemObject::Image2D &&
+        (p_src_origin[2] != 0 || p_region[2] != 1))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+}
+
+ReadImageEvent::ReadImageEvent(CommandQueue *parent,
+                               Image2D *image,
+                               const size_t origin[3],
+                               const size_t region[3],
+                               size_t row_pitch,
+                               size_t slice_pitch,
+                               void *ptr,
+                               cl_uint num_events_in_wait_list,
+                               const Event **event_wait_list,
+                               cl_int *errcode_ret)
+: ReadWriteImageEvent(parent, image, origin, region, row_pitch, slice_pitch, ptr,
+                      num_events_in_wait_list, event_wait_list, errcode_ret)
+{}
+
+Event::Type ReadImageEvent::type() const
+{
+    return Event::ReadImage;
+}
+
+WriteImageEvent::WriteImageEvent(CommandQueue *parent,
+                                 Image2D *image,
+                                 const size_t origin[3],
+                                 const size_t region[3],
+                                 size_t row_pitch,
+                                 size_t slice_pitch,
+                                 void *ptr,
+                                 cl_uint num_events_in_wait_list,
+                                 const Event **event_wait_list,
+                                 cl_int *errcode_ret)
+: ReadWriteImageEvent (parent, image, origin, region, row_pitch, slice_pitch, ptr,
+                       num_events_in_wait_list, event_wait_list, errcode_ret)
+{}
+
+Event::Type WriteImageEvent::type() const
+{
+    return Event::WriteImage;
+}
+
+// Two image formats differ when either the channel data type or the channel
+// order differs.
+static bool operator!=(const cl_image_format &a, const cl_image_format &b)
+{
+    return (a.image_channel_data_type != b.image_channel_data_type) ||
+           (a.image_channel_order != b.image_channel_order);
+}
+
+/*
+ * Image-to-image copy, expressed as a rect copy that uses each image's own
+ * pitches and the source pixel size.
+ *
+ * Besides the CopyBufferRectEvent checks: a 2D source or destination needs
+ * a zero z-origin and region[2] == 1 (CL_INVALID_VALUE otherwise), and both
+ * images must share one format (CL_IMAGE_FORMAT_MISMATCH otherwise).
+ * Either origin may be NULL; the base class then stores {0, 0, 0}.
+ */
+CopyImageEvent::CopyImageEvent(CommandQueue *parent,
+                               Image2D *source,
+                               Image2D *destination,
+                               const size_t src_origin[3],
+                               const size_t dst_origin[3],
+                               const size_t region[3],
+                               cl_uint num_events_in_wait_list,
+                               const Event **event_wait_list,
+                               cl_int *errcode_ret)
+: CopyBufferRectEvent (parent, source, destination, src_origin, dst_origin,
+                       region, source->row_pitch(), source->slice_pitch(),
+                       destination->row_pitch(), destination->slice_pitch(),
+                       source->pixel_size(), num_events_in_wait_list,
+                       event_wait_list, errcode_ret)
+{
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    // Check bounds, using the stored copies so that a NULL origin is not
+    // dereferenced
+    if (source->type() == MemObject::Image2D &&
+        (p_src_origin[2] != 0 || p_region[2] != 1))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    if (destination->type() == MemObject::Image2D &&
+        (p_dst_origin[2] != 0 || p_region[2] != 1))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Formats must match
+    if (source->format() != destination->format())
+    {
+        *errcode_ret = CL_IMAGE_FORMAT_MISMATCH;
+        return;
+    }
+}
+
+Event::Type CopyImageEvent::type() const
+{
+    return Event::CopyImage;
+}
+
+/* + * Image-to-buffer copy. After the base constructor succeeds, *errcode_ret is + * set to CL_INVALID_VALUE when the copied region (region[0..2] in pixels times + * the source pixel size) does not fit in the destination buffer starting at + * dst_offset, or when an Image2D source is given a non-zero z origin or a + * depth other than 1. + */ +CopyImageToBufferEvent::CopyImageToBufferEvent(CommandQueue *parent, + Image2D *source, + MemObject *destination, + const size_t src_origin[3], + const size_t region[3], + size_t dst_offset, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret) +: CopyBufferRectEvent(parent, source, destination, src_origin, 0, region, + source->row_pitch(), source->slice_pitch(), 0, 0, + source->pixel_size(), num_events_in_wait_list, + event_wait_list, errcode_ret), + p_offset(dst_offset) +{ + if (*errcode_ret != CL_SUCCESS) return; + + // Check for buffer overflow. The test is written as a subtraction so that + // a very large dst_offset cannot wrap dst_offset + dst_cb around to a + // small value and slip past the check. + // NOTE(review): the product below is still computed in size_t without a + // wrap-around check of its own. + size_t dst_cb = region[2] * region[1] * region[0] * source->pixel_size(); + + if (dst_cb > destination->size() || + dst_offset > destination->size() - dst_cb) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + + // Check validity + if (source->type() == MemObject::Image2D && + (src_origin[2] != 0 || region[2] != 1)) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } +} + +size_t CopyImageToBufferEvent::offset() const +{ + return p_offset; +} + +Event::Type CopyImageToBufferEvent::type() const +{ + return Event::CopyImageToBuffer; +} + +/* + * Buffer-to-image copy. After the base constructor succeeds, *errcode_ret is + * set to CL_INVALID_VALUE when the copied region (region[0..2] in pixels times + * the destination pixel size) cannot be read from the source buffer starting + * at src_offset, or when an Image2D destination is given a non-zero z origin + * or a depth other than 1. + */ +CopyBufferToImageEvent::CopyBufferToImageEvent(CommandQueue *parent, + MemObject *source, + Image2D *destination, + size_t src_offset, + const size_t dst_origin[3], + const size_t region[3], + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret) +: CopyBufferRectEvent(parent, source, destination, 0, dst_origin, region, 0, 0, + destination->row_pitch(), destination->slice_pitch(), + destination->pixel_size(), num_events_in_wait_list, + event_wait_list, errcode_ret), + p_offset(src_offset) +{ + if (*errcode_ret != CL_SUCCESS) return; + + // Check for buffer overflow. The test is written as a subtraction so that + // a very large src_offset cannot wrap src_offset + src_cb around to a + // small value and slip past the check. + // NOTE(review): the product below is still computed in size_t without a + // wrap-around check of its own. + size_t src_cb = region[2] * region[1] * region[0] * destination->pixel_size(); + + if (src_cb > source->size() || + src_offset > source->size() - src_cb) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + + // Check validity + if (destination->type() == MemObject::Image2D && + (dst_origin[2] != 0 || 
region[2] != 1)) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } +} + +size_t CopyBufferToImageEvent::offset() const +{ + return p_offset; +} + +Event::Type CopyBufferToImageEvent::type() const +{ + return Event::CopyBufferToImage; +} + +/* + * Barrier + */ + +BarrierEvent::BarrierEvent(CommandQueue *parent, cl_int *errcode_ret) +: Event(parent, Queued, 0, 0, errcode_ret) +{} + +Event::Type BarrierEvent::type() const +{ + return Event::Barrier; +} + +/* + * WaitForEvents + */ + +WaitForEventsEvent::WaitForEventsEvent(CommandQueue *parent, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret) +: Event(parent, Queued, num_events_in_wait_list, event_wait_list, errcode_ret) +{} + +Event::Type WaitForEventsEvent::type() const +{ + return Event::WaitForEvents; +} + +/* + * Marker + */ +MarkerEvent::MarkerEvent(CommandQueue *parent, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret) +: WaitForEventsEvent(parent, num_events_in_wait_list, event_wait_list, errcode_ret) +{} + +Event::Type MarkerEvent::type() const +{ + return Event::Marker; +} diff --git a/src/core/events.h b/src/core/events.h new file mode 100644 index 0000000..2311d92 --- /dev/null +++ b/src/core/events.h @@ -0,0 +1,718 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file events.h + * \brief All the event-related classes + */ + +#ifndef __EVENTS_H__ +#define __EVENTS_H__ + +#include "commandqueue.h" +#include <core/config.h> + +#include <vector> + +namespace Coal +{ + +class MemObject; +class Image2D; +class Kernel; +class DeviceKernel; +class DeviceInterface; + +/** + * \brief Buffer-related event + */ +class BufferEvent : public Event +{ + public: + BufferEvent(CommandQueue *parent, + MemObject *buffer, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret); + virtual ~BufferEvent(); + + MemObject *buffer() const; /*!< \brief Buffer on which to operate */ + + /** + * \brief Check that a buffer is correctly aligned for a device + * + * OpenCL supports sub-buffers of buffers (\c Coal::SubBuffer). They + * have to be aligned on a certain device-dependent boundary. + * + * This function checks that \p buffer is correctly aligned for + * \p device. 
If \p buffer is not a \c Coal::SubBuffer, this function + * returns true. + * + * \return true if the buffer is aligned or not a \c Coal::SubBuffer + */ + static bool isSubBufferAligned(const MemObject *buffer, + const DeviceInterface *device); + + private: + MemObject *p_buffer; +}; + +/** + * \brief Reading or writing to a buffer + */ +class ReadWriteBufferEvent : public BufferEvent +{ + public: + ReadWriteBufferEvent(CommandQueue *parent, + MemObject *buffer, + size_t offset, + size_t cb, + void *ptr, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret); + + size_t offset() const; /*!< \brief Offset in the buffer of the operation, in bytes */ + size_t cb() const; /*!< \brief Number of bytes to read or write */ + void *ptr() const; /*!< \brief Pointer in host memory at which to put the data */ + + private: + size_t p_offset, p_cb; + void *p_ptr; +}; + +/** + * \brief Reading a buffer + */ +class ReadBufferEvent : public ReadWriteBufferEvent +{ + public: + ReadBufferEvent(CommandQueue *parent, + MemObject *buffer, + size_t offset, + size_t cb, + void *ptr, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret); + + Type type() const; /*!< \brief Say the event is a \c Coal::Event::ReadBuffer one */ +}; + +/** + * \brief Writing a buffer + */ +class WriteBufferEvent : public ReadWriteBufferEvent +{ + public: + WriteBufferEvent(CommandQueue *parent, + MemObject *buffer, + size_t offset, + size_t cb, + void *ptr, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret); + + Type type() const; /*!< \brief Say the event is a \c Coal::Event::WriteBuffer one */ +}; + +/** + * \brief Mapping a buffer + */ +class MapBufferEvent : public BufferEvent +{ + public: + MapBufferEvent(CommandQueue *parent, + MemObject *buffer, + size_t offset, + size_t cb, + cl_map_flags map_flags, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret); + 
+ Type type() const; /*!< \brief Say the event is a \c Coal::Event::MapBuffer one */ + + size_t offset() const; /*!< \brief Offset in the buffer at which the mapping begins, in bytes */ + size_t cb() const; /*!< \brief Number of bytes to map */ + cl_map_flags flags() const; /*!< \brief Flags of the mapping */ + void *ptr() const; /*!< \brief Pointer at which the data has been mapped */ + + /** + * \brief Set the memory location at which the data has been mapped + * + * This function is called by the device when it has successfully mapped + * the buffer. It must be called inside + * \c Coal::DeviceInterface::initEventDeviceData(). + * + * \param ptr the address at which the buffer has been mapped + */ + void setPtr(void *ptr); + + private: + size_t p_offset, p_cb; + cl_map_flags p_map_flags; + void *p_ptr; +}; + +/** + * \brief Mapping an image + */ +class MapImageEvent : public BufferEvent +{ + public: + MapImageEvent(CommandQueue *parent, + Image2D *image, + cl_map_flags map_flags, + const size_t origin[3], + const size_t region[3], + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret); + + Type type() const; /*!< \brief Say the event is a \c Coal::Event::MapImage one */ + + /** + * \brief Origin of the mapping, in pixels, for the given dimension + * \param index dimension for which the origin is retrieved + * \return origin of the mapping for the given dimension + */ + size_t origin(unsigned int index) const; + + /** + * \brief Region of the mapping, in pixels, for the given dimension + * \param index dimension for which the region is retrieved + * \return region of the mapping for the given dimension + */ + size_t region(unsigned int index) const; + cl_map_flags flags() const; /*!< \brief Flags of the mapping */ + + void *ptr() const; /*!< \brief Pointer at which the data is mapped */ + size_t row_pitch() const; /*!< \brief Row pitch of the mapped data */ + size_t slice_pitch() const; /*!< \brief Slice pitch of the mapped 
data */ + + /** + * \brief Set the memory location at which the image is mapped + * + * This function must be called by + * \c Coal::DeviceInterface::initEventDeviceData(). Row and slice pitches + * must also be set by this function by calling \c setRowPitch() and + * \c setSlicePitch(). + * + * \param ptr pointer at which the data is available + */ + void setPtr(void *ptr); + void setRowPitch(size_t row_pitch); /*!< \brief Set row pitch */ + void setSlicePitch(size_t slice_pitch); /*!< \brief Set slice pitch */ + + private: + cl_map_flags p_map_flags; + size_t p_origin[3], p_region[3]; + void *p_ptr; + size_t p_slice_pitch, p_row_pitch; +}; + +/** + * \brief Unmapping a memory object + */ +class UnmapBufferEvent : public BufferEvent +{ + public: + UnmapBufferEvent(CommandQueue *parent, + MemObject *buffer, + void *mapped_addr, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret); + + Type type() const; /*!< \brief Say the event is a \c Coal::Event::UnmapBuffer one */ + + void *mapping() const; /*!< \brief Mapped address to unmap */ + + private: + void *p_mapping; +}; + +/** + * \brief Copying between two buffers + */ +class CopyBufferEvent : public BufferEvent +{ + public: + CopyBufferEvent(CommandQueue *parent, + MemObject *source, + MemObject *destination, + size_t src_offset, + size_t dst_offset, + size_t cb, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret); + ~CopyBufferEvent(); + + Type type() const; /*!< \brief Say the event is a \c Coal::Event::CopyBuffer one */ + + MemObject *source() const; /*!< \brief Source buffer, equivalent to \c Coal::BufferEvent::buffer() */ + MemObject *destination() const; /*!< \brief Destination buffer */ + size_t src_offset() const; /*!< \brief Offset in the source buffer, in bytes */ + size_t dst_offset() const; /*!< \brief Offset in the destination buffer, in bytes */ + size_t cb() const; /*!< \brief Number of bytes to copy */ + + private: + 
MemObject *p_destination; + size_t p_src_offset, p_dst_offset, p_cb; +}; + +/** + * \brief Events related to rectangular (or cubic) memory regions + * + * This event is the base for all the *BufferRect events, and the Image ones. + */ +class ReadWriteCopyBufferRectEvent : public BufferEvent +{ + public: + ReadWriteCopyBufferRectEvent(CommandQueue *parent, + MemObject *source, + const size_t src_origin[3], + const size_t dst_origin[3], + const size_t region[3], + size_t src_row_pitch, + size_t src_slice_pitch, + size_t dst_row_pitch, + size_t dst_slice_pitch, + unsigned int bytes_per_element, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret); + + size_t src_origin(unsigned int index) const; /*!< \brief Source origin for the \p index dimension */ + size_t dst_origin(unsigned int index) const; /*!< \brief Destination origin for the \p index dimension */ + size_t region(unsigned int index) const; /*!< \brief Region to copy for the \p index dimension */ + size_t src_row_pitch() const; /*!< \brief Source row pitch */ + size_t src_slice_pitch() const; /*!< \brief Source slice pitch */ + size_t dst_row_pitch() const; /*!< \brief Destination row pitch */ + size_t dst_slice_pitch() const; /*!< \brief Destination slice pitch */ + MemObject *source() const; /*!< \brief Source of the copy, for readability. Calls \c Coal::BufferEvent::buffer(). 
*/ + + protected: + size_t p_src_origin[3], p_dst_origin[3], p_region[3]; + size_t p_src_row_pitch, p_src_slice_pitch; + size_t p_dst_row_pitch, p_dst_slice_pitch; +}; + +/** + * \brief Copying between two buffers + */ +class CopyBufferRectEvent : public ReadWriteCopyBufferRectEvent +{ + public: + CopyBufferRectEvent(CommandQueue *parent, + MemObject *source, + MemObject *destination, + const size_t src_origin[3], + const size_t dst_origin[3], + const size_t region[3], + size_t src_row_pitch, + size_t src_slice_pitch, + size_t dst_row_pitch, + size_t dst_slice_pitch, + unsigned int bytes_per_element, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret); + + virtual Type type() const; /*!< \brief Say the event is a \c Coal::Event::CopyBufferRect one */ + MemObject *destination() const; /*!< \brief Destination buffer */ + + private: + MemObject *p_destination; +}; + +/** + * \brief Reading or writing to a buffer + */ +class ReadWriteBufferRectEvent : public ReadWriteCopyBufferRectEvent +{ + public: + ReadWriteBufferRectEvent(CommandQueue *parent, + MemObject *buffer, + const size_t buffer_origin[3], + const size_t host_origin[3], + const size_t region[3], + size_t buffer_row_pitch, + size_t buffer_slice_pitch, + size_t host_row_pitch, + size_t host_slice_pitch, + void *ptr, + unsigned int bytes_per_element, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret); + + void *ptr() const; /*!< \brief Pointer in host memory in which to put the data */ + + private: + void *p_ptr; +}; + +/** + * \brief Reading a buffer + */ +class ReadBufferRectEvent : public ReadWriteBufferRectEvent +{ + public: + ReadBufferRectEvent(CommandQueue *parent, + MemObject *buffer, + const size_t buffer_origin[3], + const size_t host_origin[3], + const size_t region[3], + size_t buffer_row_pitch, + size_t buffer_slice_pitch, + size_t host_row_pitch, + size_t host_slice_pitch, + void *ptr, + cl_uint num_events_in_wait_list, 
+ const Event **event_wait_list, + cl_int *errcode_ret); + + Type type() const; /*!< \brief Say the event is a \c Coal::Event::ReadBufferRect one */ +}; + +/** + * \brief Writing a buffer + */ +class WriteBufferRectEvent : public ReadWriteBufferRectEvent +{ + public: + WriteBufferRectEvent(CommandQueue *parent, + MemObject *buffer, + const size_t buffer_origin[3], + const size_t host_origin[3], + const size_t region[3], + size_t buffer_row_pitch, + size_t buffer_slice_pitch, + size_t host_row_pitch, + size_t host_slice_pitch, + void *ptr, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret); + + Type type() const; /*!< \brief Say the event is a \c Coal::Event::WriteBufferRect one */ +}; + +/** + * \brief Reading or writing images + * + * This class only converts some of the arguments given to its constructor + * to the one of \c Coal::ReadWriteBufferRectEvent. For example, the source row + * and slice pitches are read from the \c Coal::Image2D object. 
+ */ +class ReadWriteImageEvent : public ReadWriteBufferRectEvent +{ + public: + ReadWriteImageEvent(CommandQueue *parent, + Image2D *image, + const size_t origin[3], + const size_t region[3], + size_t row_pitch, + size_t slice_pitch, + void *ptr, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret); +}; + +/** + * \brief Reading an image + */ +class ReadImageEvent : public ReadWriteImageEvent +{ + public: + ReadImageEvent(CommandQueue *parent, + Image2D *image, + const size_t origin[3], + const size_t region[3], + size_t row_pitch, + size_t slice_pitch, + void *ptr, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret); + + Type type() const; /*!< \brief Say the event is a \c Coal::Event::ReadImage one */ +}; + +/** + * \brief Writing to an image + */ +class WriteImageEvent : public ReadWriteImageEvent +{ + public: + WriteImageEvent(CommandQueue *parent, + Image2D *image, + const size_t origin[3], + const size_t region[3], + size_t row_pitch, + size_t slice_pitch, + void *ptr, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret); + + Type type() const; /*!< \brief Say the event is a \c Coal::Event::WriteImage one */ +}; + +/** + * \brief Copying between two images + */ +class CopyImageEvent : public CopyBufferRectEvent +{ + public: + CopyImageEvent(CommandQueue *parent, + Image2D *source, + Image2D *destination, + const size_t src_origin[3], + const size_t dst_origin[3], + const size_t region[3], + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret); + + Type type() const; /*!< \brief Say the event is a \c Coal::Event::CopyImage one */ +}; + +/** + * \brief Copying an image to a buffer + */ +class CopyImageToBufferEvent : public CopyBufferRectEvent +{ + public: + CopyImageToBufferEvent(CommandQueue *parent, + Image2D *source, + MemObject *destination, + const size_t src_origin[3], + const size_t region[3], + size_t 
dst_offset, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret); + + size_t offset() const; /*!< \brief Offset in the buffer at which the image is written */ + Type type() const; /*!< \brief Say the event is a \c Coal::Event::CopyImageToBuffer one */ + + private: + size_t p_offset; +}; + +/** + * \brief Copying a buffer to an image + */ +class CopyBufferToImageEvent : public CopyBufferRectEvent +{ + public: + CopyBufferToImageEvent(CommandQueue *parent, + MemObject *source, + Image2D *destination, + size_t src_offset, + const size_t dst_origin[3], + const size_t region[3], + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret); + + size_t offset() const; /*!< \brief Offset in the buffer at which the copy starts */ + Type type() const; /*!< \brief Say the event is a \c Coal::Event::CopyBufferToImage one */ + + private: + size_t p_offset; +}; + +/** + * \brief Executing a native function as a kernel + * + * This event builds an argument list to give to the native function. It needs + * for example to replace every occurrence of a \c Coal::MemObject by a pointer + * to data the host CPU can actually access, using + * \c Coal::DeviceBuffer::nativeGlobalPointer(). 
+ */ +class NativeKernelEvent : public Event +{ + public: + NativeKernelEvent(CommandQueue *parent, + void (*user_func)(void *), + void *args, + size_t cb_args, + cl_uint num_mem_objects, + const MemObject **mem_list, + const void **args_mem_loc, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret); + ~NativeKernelEvent(); + + Type type() const; /*!< \brief Say the event is a \c Coal::Event::NativeKernel one */ + + void *function() const; /*!< \brief Host function to call */ + void *args() const; /*!< \brief Args to give to the host function */ + + private: + void *p_user_func; + void *p_args; +}; + +/** + * \brief Executing a compiled kernel + */ +class KernelEvent : public Event +{ + public: + KernelEvent(CommandQueue *parent, + Kernel *kernel, + cl_uint work_dim, + const size_t *global_work_offset, + const size_t *global_work_size, + const size_t *local_work_size, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret); + ~KernelEvent(); + + cl_uint work_dim() const; /*!< \brief Number of working dimensions */ + size_t global_work_offset(cl_uint dim) const; /*!< \brief Global work offset for the \p dim dimension */ + size_t global_work_size(cl_uint dim) const; /*!< \brief Global work size for the \p dim dimension */ + size_t local_work_size(cl_uint dim) const; /*!< \brief Number of work-items per work-group for the \p dim dimension */ + Kernel *kernel() const; /*!< \brief \c Coal::Kernel object to run */ + DeviceKernel *deviceKernel() const; /*!< \brief \c Coal::DeviceKernel for the kernel and device of this event */ + + virtual Type type() const; /*!< \brief Say the event is a \c Coal::Event::NDRangeKernel one */ + + private: + cl_uint p_work_dim; + size_t p_global_work_offset[MAX_WORK_DIMS], + p_global_work_size[MAX_WORK_DIMS], + p_local_work_size[MAX_WORK_DIMS], + p_max_work_item_sizes[MAX_WORK_DIMS]; + Kernel *p_kernel; + DeviceKernel *p_dev_kernel; +}; + +/** + * \brief Executing a 
task kernel + * + * This event is simply a \c Coal::KernelEvent with: + * + * - \c work_dim() set to 1 + * - \c global_work_offset() set to {0} + * - \c global_work_size() set to {1} + * - \c local_work_size() set to {1} + * + * It's in fact a \c Coal::KernelEvent containing only one single work-item. + */ +class TaskEvent : public KernelEvent +{ + public: + TaskEvent(CommandQueue *parent, + Kernel *kernel, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret); + + Type type() const; /*!< \brief Say the event is a \c Coal::Event::TaskKernel one */ +}; + +/** + * \brief User event + * + * This event is a bit special as it is created by a call to + * \c clCreateUserEvent() and doesn't belong to an event queue. Thus, a means had + * to be found for everything to work. + * + * The solution is the \c addDependentCommandQueue() function, called every time + * the user event is added to a command queue. When this event becomes completed, + * \c flushQueues() is called to allow all the \c Coal::CommandQueue objects + * containing this event to push more events on their device. + * + * This way, command queues are not blocked by user events. 
+ */ +class UserEvent : public Event +{ + public: + UserEvent(Context *context, cl_int *errcode_ret); + + Type type() const; /*!< \brief Say the event is a \c Coal::Event::User one */ + Context *context() const; /*!< \brief Context of this event */ + + private: + Context *p_context; +}; + +/** + * \brief Barrier event + */ +class BarrierEvent : public Event +{ + public: + BarrierEvent(CommandQueue *parent, + cl_int *errcode_ret); + + Type type() const; /*!< \brief Say the event is a \c Coal::Event::Barrier one */ +}; + +/** + * \brief Event waiting for others to complete before being completed + */ +class WaitForEventsEvent : public Event +{ + public: + WaitForEventsEvent(CommandQueue *parent, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret); + + virtual Type type() const; /*!< \brief Say the event is a \c Coal::Event::WaitForEvents one */ +}; + +/** + * \brief Marker event + */ +class MarkerEvent : public WaitForEventsEvent +{ + public: + MarkerEvent(CommandQueue *parent, + cl_uint num_events_in_wait_list, + const Event **event_wait_list, + cl_int *errcode_ret); + + Type type() const; /*!< \brief Say the event is a \c Coal::Event::Marker one */ +}; + +} + +#endif diff --git a/src/core/icd.cpp b/src/core/icd.cpp new file mode 100644 index 0000000..2c62035 --- /dev/null +++ b/src/core/icd.cpp @@ -0,0 +1,145 @@ +/****************************************************************************** + * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "CL/cl.h" +#include "platform.h" +#include "icd.h" + +void * dispatch_table[] = +{ + (void*) clGetPlatformIDs, + (void*) clGetPlatformInfo, + (void*) clGetDeviceIDs, + (void*) clGetDeviceInfo, + (void*) clCreateContext, + (void*) clCreateContextFromType, + (void*) clRetainContext, + (void*) clReleaseContext, + (void*) clGetContextInfo, + (void*) clCreateCommandQueue, + (void*) clRetainCommandQueue, + (void*) clReleaseCommandQueue, + (void*) clGetCommandQueueInfo, + (void*) 0, //clSetCommandQueueProperty, + (void*) clCreateBuffer, + (void*) clCreateImage2D, + (void*) clCreateImage3D, + (void*) clRetainMemObject, + (void*) clReleaseMemObject, + (void*) clGetSupportedImageFormats, + (void*) clGetMemObjectInfo, + (void*) clGetImageInfo, + (void*) clCreateSampler, + (void*) clRetainSampler, + (void*) clReleaseSampler, + (void*) clGetSamplerInfo, + (void*) clCreateProgramWithSource, + (void*) clCreateProgramWithBinary, + (void*) clRetainProgram, + (void*) clReleaseProgram, + (void*) clBuildProgram, + (void*) clUnloadCompiler, + (void*) clGetProgramInfo, + (void*) clGetProgramBuildInfo, + (void*) clCreateKernel, + (void*) clCreateKernelsInProgram, + (void*) clRetainKernel, + (void*) clReleaseKernel, + (void*) clSetKernelArg, + (void*) clGetKernelInfo, + (void*) clGetKernelWorkGroupInfo, + (void*) clWaitForEvents, + (void*) clGetEventInfo, + (void*) clRetainEvent, + (void*) clReleaseEvent, + (void*) clGetEventProfilingInfo, + (void*) clFlush, + (void*) clFinish, + (void*) clEnqueueReadBuffer, + (void*) clEnqueueWriteBuffer, + (void*) clEnqueueCopyBuffer, + (void*) clEnqueueReadImage, + (void*) clEnqueueWriteImage, + (void*) clEnqueueCopyImage, + (void*) clEnqueueCopyImageToBuffer, + (void*) clEnqueueCopyBufferToImage, + (void*) clEnqueueMapBuffer, + (void*) clEnqueueMapImage, + (void*) clEnqueueUnmapMemObject, + (void*) clEnqueueNDRangeKernel, + (void*) clEnqueueTask, + (void*) 
clEnqueueNativeKernel, + (void*) clEnqueueMarker, + (void*) clEnqueueWaitForEvents, + (void*) clEnqueueBarrier, + (void*) clGetExtensionFunctionAddress, + (void*) 0, //clCreateFromGLBuffer, + (void*) 0, //clCreateFromGLTexture2D, + (void*) 0, //clCreateFromGLTexture3D, + (void*) 0, //clCreateFromGLRenderbuffer, + (void*) 0, //clGetGLObjectInfo, + (void*) 0, //clGetGLTextureInfo, + (void*) 0, //clEnqueueAcquireGLObjects, + (void*) 0, //clEnqueueReleaseGLObjects, + (void*) 0, //clGetGLContextInfoKHR, + (void*) 0, //clGetDeviceIDsFromD3D10KHR, + (void*) 0, //clCreateFromD3D10BufferKHR, + (void*) 0, //clCreateFromD3D10Texture2DKHR, + (void*) 0, //clCreateFromD3D10Texture3DKHR, + (void*) 0, //clEnqueueAcquireD3D10ObjectsKHR, + (void*) 0, //clEnqueueReleaseD3D10ObjectsKHR, + (void*) clSetEventCallback, + (void*) clCreateSubBuffer, + (void*) clSetMemObjectDestructorCallback, + (void*) clCreateUserEvent, + (void*) clSetUserEventStatus, + (void*) clEnqueueReadBufferRect, + (void*) clEnqueueWriteBufferRect, + (void*) clEnqueueCopyBufferRect, + (void*) 0, //clCreateSubDevicesEXT, + (void*) 0, //clRetainDeviceEXT, + (void*) 0, //clReleaseDeviceEXT +}; + + +/*------------------------------------------------------------------------- + * Report the single platform of this implementation. + * + * When num_platforms is non-null it always receives 1, even on the + * num_entries error path below. Returns CL_INVALID_VALUE when both + * platforms and num_platforms are null, or when num_entries is 0 while + * platforms is non-null. Otherwise stores &the_platform in *platforms + * (if platforms is non-null) and returns CL_SUCCESS. + *------------------------------------------------------------------------*/ +cl_int CL_API_CALL +clIcdGetPlatformIDsKHR(cl_uint num_entries, + cl_platform_id * platforms, + cl_uint * num_platforms) +{ + if (num_platforms) *num_platforms = 1; + else if (!platforms) return CL_INVALID_VALUE; + + if (!num_entries && platforms) return CL_INVALID_VALUE; + + /*------------------------------------------------------------------------- + * Only one "default" platform + *------------------------------------------------------------------------*/ + if (platforms != 0) *platforms = &the_platform; + + return CL_SUCCESS; +} diff --git a/src/core/icd.h b/src/core/icd.h new file mode 100644 index 0000000..591aed6 --- /dev/null +++ b/src/core/icd.h @@ -0,0 +1,44 @@ +/****************************************************************************** + * Copyright (c) 2011-2014, Texas Instruments Incorporated - 
http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#ifndef _ICD_H +#define _ICD_H +#include "CL/cl.h" + +typedef void *(KHRicdVendorDispatch)[]; +extern KHRicdVendorDispatch dispatch_table; + +class Dispatch +{ + public: + Dispatch() : dispatch(&dispatch_table) {} + private: + KHRicdVendorDispatch *dispatch; +}; + +#endif // _ICD_H + diff --git a/src/core/kernel.cpp b/src/core/kernel.cpp new file mode 100644 index 0000000..4c53576 --- /dev/null +++ b/src/core/kernel.cpp @@ -0,0 +1,637 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file core/kernel.cpp + * \brief Kernel + */ + +#include "kernel.h" +#include "propertylist.h" +#include "program.h" +#include "memobject.h" +#include "sampler.h" +#include "deviceinterface.h" + +#include <string> +#include <iostream> +#include <cstring> +#include <cstdio> +#include <cstdlib> +#include <boost/tuple/tuple.hpp> + +#include <llvm/Support/Casting.h> +#include <llvm/IR/Module.h> +#include <llvm/IR/Type.h> +#include <llvm/IR/DerivedTypes.h> +#include <llvm/IR/Constants.h> +#include <llvm/IR/Metadata.h> +#include <llvm/IR/DataLayout.h> + + +using namespace Coal; +Kernel::Kernel(Program *program) +: Object(Object::T_Kernel, program), p_has_locals(false), wi_alloca_size(0) +{ + // TODO: Say a kernel is attached to the program (that becomes unalterable) + + null_dep.device = 0; + null_dep.kernel = 0; + null_dep.function = 0; + null_dep.module = 0; + p_name = ""; +} + +Kernel::~Kernel() +{ + while (p_device_dependent.size()) + { + DeviceDependent &dep = p_device_dependent.back(); + + delete dep.kernel; + + p_device_dependent.pop_back(); + } +} + +const Kernel::DeviceDependent &Kernel::deviceDependent(DeviceInterface *device) const +{ + for (size_t i=0; i<p_device_dependent.size(); ++i) + { + const DeviceDependent &rs = p_device_dependent[i]; + + if (rs.device == device || (!device && p_device_dependent.size() == 1)) + return rs; + } + + return null_dep; +} + +Kernel::DeviceDependent &Kernel::deviceDependent(DeviceInterface *device) +{ 
+ for (size_t i=0; i<p_device_dependent.size(); ++i) + { + DeviceDependent &rs = p_device_dependent[i]; + + if (rs.device == device || (!device && p_device_dependent.size() == 1)) + return rs; + } + + return null_dep; +} + +/****************************************************************************** +* cl_int Kernel::addFunction +******************************************************************************/ +cl_int Kernel::addFunction(DeviceInterface *device, llvm::Function *function, + llvm::Module *module) +{ + llvm::DataLayout TD(module); + +#if 0 // Uncomment to see the Function IR being generated: + function->dump(); +#endif + + p_name = function->getName().str(); + + // Get wi_alloca_size, to be used for computing wg_alloca_size + std::string fattrs = function->getAttributes().getAsString( + llvm::AttributeSet::FunctionIndex); + std::size_t found = fattrs.find("_wi_alloca_size="); + if (found != std::string::npos) + wi_alloca_size = atoi(fattrs.data() + found + 16); + + /*------------------------------------------------------------------------- + * Add a device dependent + *------------------------------------------------------------------------*/ + DeviceDependent dep; + + dep.device = device; + dep.function = function; + dep.module = module; + + /*------------------------------------------------------------------------- + * Build the arg list of the kernel (or verify it if a previous function + * was already registered) + *------------------------------------------------------------------------*/ + llvm::FunctionType *f = function->getFunctionType(); + bool append = (p_args.size() == 0); + + if (!append && p_args.size() != f->getNumParams()) + return CL_INVALID_KERNEL_DEFINITION; + + int i = 0; + for (llvm::Function::arg_iterator I = function->arg_begin(), + E = function->arg_end(); I != E; ++I, i++) + { + llvm::Type *param_type = f->getParamType(i); + llvm::Argument *arg = I; + Arg::Kind kind = Arg::Invalid; + Arg::File file = Arg::Private; + unsigned 
short vec_dim = 1; + + llvm::Type *arg_type = arg->getType(); + const unsigned arg_store_size = TD.getTypeStoreSize(arg_type); + + // LLVM IR writes parameters passed by value as pointers: + if (llvm::isa<llvm::PointerType>(arg_type) && arg->hasByValAttr()) { + arg_type = llvm::dyn_cast<llvm::PointerType>(arg_type)->getElementType(); + } + + llvm::Type *itype = TD.getSmallestLegalIntType(module->getContext(), arg_store_size * 8); + llvm::Type *target_type = (itype != NULL && arg_type->isIntegerTy()) ? itype : arg_type; + + unsigned target_size = TD.getTypeStoreSize(target_type); + unsigned target_align = TD.getABITypeAlignment(target_type); + +#if 0 // Uncomment to see arg info + arg_type->dump(); std::cout << " Size: " << target_size << " Align: " << target_align << std::endl ; +#endif + + if (arg_type->isPointerTy()) + { + // It's a pointer, dereference it + llvm::PointerType *p_type = llvm::cast<llvm::PointerType>(arg_type); + + file = (Arg::File)p_type->getAddressSpace(); + arg_type = p_type->getElementType(); + + // If it's a __local argument, we'll have to allocate memory at run time + if (file == Arg::Local) + p_has_locals = true; + + kind = Arg::Buffer; + + // If it's a struct, get its name + if (arg_type->isStructTy()) + { + llvm::StructType *struct_type = + llvm::cast<llvm::StructType>(arg_type); + std::string struct_name = struct_type->getName().str(); + + if (struct_name.compare(0, 14, "struct.image2d") == 0) + { + kind = Arg::Image2D; + file = Arg::Global; + } + else if (struct_name.compare(0, 14, "struct.image3d") == 0) + { + kind = Arg::Image3D; + file = Arg::Global; + } + } + } + else + { + if (arg_type->isVectorTy()) + { + // It's a vector, we need its element's type + llvm::VectorType *v_type = llvm::cast<llvm::VectorType>(arg_type); + + vec_dim = v_type->getNumElements(); + arg_type = v_type->getElementType(); + } + + // Get type kind + if (arg_type->isFloatTy()) + { + kind = Arg::Float; + } + else if (arg_type->isDoubleTy()) + { + kind = 
Arg::Double; + } + else if (arg_type->isIntegerTy()) + { + llvm::IntegerType *i_type = llvm::cast<llvm::IntegerType>(arg_type); + + if (i_type->getBitWidth() == 8) + { + kind = Arg::Int8; + } + else if (i_type->getBitWidth() == 16) + { + kind = Arg::Int16; + } + else if (i_type->getBitWidth() == 32) + { + // NOTE: May also be a sampler, check done in setArg + kind = Arg::Int32; + } + else if (i_type->getBitWidth() == 64) + { + kind = Arg::Int64; + } + } + } + + // Check if we recognized the type + if (kind == Arg::Invalid) + return CL_INVALID_KERNEL_DEFINITION; + + // Create arg + Arg *a= new Arg(vec_dim, file, kind, target_align); + + // If we also have a function registered, check for signature compliance. + // Compare the Arg values (vec_dim, file, kind), not the pointers: a freshly + // allocated Arg never has the same address as the one already stored. + if (!append) + { + const bool mismatch = (*a != *p_args[i]); + + // The temporary was only needed for the comparison + delete a; + + if (mismatch) + return CL_INVALID_KERNEL_DEFINITION; + } + else + { + // First function registered for this kernel: p_args keeps the arg + p_args.push_back(a); + } + } + + dep.kernel = device->createDeviceKernel(this, dep.function); + p_device_dependent.push_back(dep); + + return CL_SUCCESS; +} + +llvm::Function *Kernel::function(DeviceInterface *device) const +{ + const DeviceDependent &dep = deviceDependent(device); + + return dep.function; +} + +/****************************************************************************** +* cl_int Kernel::setArg +******************************************************************************/ +cl_int Kernel::setArg(cl_uint index, size_t size, const void *value) +{ + // Valid indices are 0 .. p_args.size() - 1 + if (index >= p_args.size()) + return CL_INVALID_ARG_INDEX; + + Arg *arg = p_args[index]; + + /*------------------------------------------------------------------------- + * Special case for __local pointers + *------------------------------------------------------------------------*/ + if (arg->file() == Arg::Local) + { + if (size == 0) return CL_INVALID_ARG_SIZE; + if (value != 0) return CL_INVALID_ARG_VALUE; + + arg->setAllocAtKernelRuntime(size); + return CL_SUCCESS; + } + + /*------------------------------------------------------------------------- + * Check that size corresponds to 
the arg type + *------------------------------------------------------------------------*/ + size_t arg_size = arg->valueSize() * arg->vecDim(); + + /*------------------------------------------------------------------------- + * Special case for samplers (pointers in C++, uint32 in OpenCL). + *------------------------------------------------------------------------*/ + if (size == sizeof(cl_sampler) && arg_size == 4 && + (*(Object **)value)->isA(T_Sampler)) + { + unsigned int bitfield = (*(Sampler **)value)->bitfield(); + + arg->refineKind(Arg::Sampler); + arg->alloc(); + arg->loadData(&bitfield, size); + + return CL_SUCCESS; + } + + // LLVM IR redefines function parameter types to fit the smallest integer type width for the ABI + // eg: <2xi8> (2 bytes) may actually be pushed as an i32 (4 bytes!), but this knowledge is + // not known to shamrock. But, we do know the parameter type alignment in addFunction(). + // So allow sizes less than or equal to the target alignment to succeed the size test: + if ((size != arg_size) && (size > arg->targetAlignment())) return CL_INVALID_ARG_SIZE; + + /*------------------------------------------------------------------------- + * Check for null values + *------------------------------------------------------------------------*/ + cl_mem null_mem = 0; + + if (!value) + { + switch (arg->kind()) + { + /*------------------------------------------------------------- + * Special case buffers : value can be 0 (or point to 0) + *------------------------------------------------------------*/ + case Arg::Buffer: + case Arg::Image2D: + case Arg::Image3D: value = &null_mem; + default: return CL_INVALID_ARG_VALUE; + } + } + + /*------------------------------------------------------------------------- + * Copy just the data actually passed. Expect LLVM to do the signext/zeroext. 
+ *------------------------------------------------------------------------*/ + arg->alloc(); + arg->loadData(value, size); + + return CL_SUCCESS; +} + +unsigned int Kernel::numArgs() const +{ + return p_args.size(); +} + +const Kernel::Arg *Kernel::arg(unsigned int index) const +{ + return p_args.at(index); +} + +bool Kernel::argsSpecified() const +{ + for (size_t i=0; i<p_args.size(); ++i) + if (!p_args[i]->defined()) return false; + return true; +} + +bool Kernel::hasLocals() const +{ + return p_has_locals; +} + +DeviceKernel *Kernel::deviceDependentKernel(DeviceInterface *device) const +{ + const DeviceDependent &dep = deviceDependent(device); + + return dep.kernel; +} + +llvm::Module *Kernel::deviceDependentModule(DeviceInterface *device) const +{ + const DeviceDependent &dep = deviceDependent(device); + + return dep.module; +} + +cl_int Kernel::info(cl_kernel_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) const +{ + void *value = 0; + size_t value_length = 0; + + union { + cl_uint cl_uint_var; + cl_program cl_program_var; + cl_context cl_context_var; + }; + + switch (param_name) + { + case CL_KERNEL_FUNCTION_NAME: + MEM_ASSIGN(p_name.size() + 1, p_name.c_str()); + break; + + case CL_KERNEL_NUM_ARGS: + SIMPLE_ASSIGN(cl_uint, p_args.size()); + break; + + case CL_KERNEL_REFERENCE_COUNT: + SIMPLE_ASSIGN(cl_uint, references()); + break; + + case CL_KERNEL_CONTEXT: + SIMPLE_ASSIGN(cl_context, parent()->parent()); + break; + + case CL_KERNEL_PROGRAM: + SIMPLE_ASSIGN(cl_program, parent()); + break; + + default: + return CL_INVALID_VALUE; + } + + if (param_value && param_value_size < value_length) + return CL_INVALID_VALUE; + + if (param_value_size_ret) + *param_value_size_ret = value_length; + + if (param_value) + std::memcpy(param_value, value, value_length); + + return CL_SUCCESS; +} + +boost::tuple<uint,uint,uint> Kernel::reqdWorkGroupSize(llvm::Module *module) const +{ + llvm::NamedMDNode *kernels = 
module->getNamedMetadata("opencl.kernels"); + + boost::tuple<uint,uint,uint> zeros(0,0,0); + + if (!kernels) return zeros; + + for (unsigned int i=0; i<kernels->getNumOperands(); ++i) + { + llvm::MDNode *node = kernels->getOperand(i); + + /*--------------------------------------------------------------------- + * Each node has only one operand : a llvm::Function + *--------------------------------------------------------------------*/ + llvm::Value *value = node->getOperand(0); + + /*--------------------------------------------------------------------- + * Bug somewhere, don't crash + *--------------------------------------------------------------------*/ + if (!llvm::isa<llvm::Function>(value)) continue; + + llvm::Function *f = llvm::cast<llvm::Function>(value); + if(f->getName().str() != p_name) continue; + + if (node->getNumOperands() <= 1) return zeros; + + llvm::MDNode *meta = llvm::cast<llvm::MDNode>(node->getOperand(1)); + if (meta->getNumOperands() == 4 && + meta->getOperand(0)->getName().str() == std::string("reqd_work_group_size")) + { + uint x = llvm::cast<llvm::ConstantInt> (meta->getOperand(1))->getValue().getLimitedValue(); + uint y = llvm::cast<llvm::ConstantInt> (meta->getOperand(2))->getValue().getLimitedValue(); + uint z = llvm::cast<llvm::ConstantInt> (meta->getOperand(3))->getValue().getLimitedValue(); + + return boost::tuple<uint,uint,uint> (x,y,z); + } + return zeros; + } + + /*------------------------------------------------------------------------- + * No metadata node matched this kernel: report "no required size" instead + * of flowing off the end of a non-void function. + *------------------------------------------------------------------------*/ + return zeros; +} + + +cl_int Kernel::workGroupInfo(DeviceInterface *device, + cl_kernel_work_group_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) const +{ + void *value = 0; + size_t value_length = 0; + + union { + size_t size_t_var; + size_t three_size_t[3]; + cl_ulong cl_ulong_var; + }; + + const DeviceDependent &dep = deviceDependent(device); + + // BUG? Shouldn't we check if the kernel is associated with + // the default device ? 
+ if (!device && p_device_dependent.size() > 1) + return CL_INVALID_DEVICE; + + switch (param_name) + { + case CL_KERNEL_WORK_GROUP_SIZE: + SIMPLE_ASSIGN(size_t, dep.kernel->workGroupSize()); + break; + + case CL_KERNEL_COMPILE_WORK_GROUP_SIZE: + { + boost::tuple<uint,uint,uint> res(reqdWorkGroupSize(dep.module)); + three_size_t[0] = res.get<0>(); + three_size_t[1] = res.get<1>(); + three_size_t[2] = res.get<2>(); + value = &three_size_t; + value_length = sizeof(three_size_t); + } + break; + + case CL_KERNEL_LOCAL_MEM_SIZE: + SIMPLE_ASSIGN(cl_ulong, dep.kernel->localMemSize()); + break; + + case CL_KERNEL_PRIVATE_MEM_SIZE: + SIMPLE_ASSIGN(cl_ulong, dep.kernel->privateMemSize()); + break; + + case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: + SIMPLE_ASSIGN(size_t, dep.kernel->preferredWorkGroupSizeMultiple()); + break; + + default: + return CL_INVALID_VALUE; + } + + if (param_value && param_value_size < value_length) + return CL_INVALID_VALUE; + + if (param_value_size_ret) + *param_value_size_ret = value_length; + + if (param_value) + std::memcpy(param_value, value, value_length); + + return CL_SUCCESS; +} + +/* + * Kernel::Arg + */ +Kernel::Arg::Arg(unsigned short vec_dim, File file, Kind kind, size_t targ_align) + : p_vec_dim(vec_dim), p_file(file), p_kind(kind), p_targ_align(targ_align), p_data(0), p_defined(false), + p_runtime_alloc(0) +{ } + +Kernel::Arg::~Arg() +{ + if (p_data) std::free(p_data); +} + +void Kernel::Arg::alloc() +{ + if (!p_data) p_data = std::calloc(p_vec_dim, valueSize()); +} + +void Kernel::Arg::loadData(const void *data, size_t size) +{ + assert ( size <= p_vec_dim * valueSize()); + std::memcpy(p_data, data, size); + p_defined = true; +} + +void Kernel::Arg::setAllocAtKernelRuntime(size_t size) +{ + p_runtime_alloc = size; + p_defined = true; +} + +void Kernel::Arg::refineKind (Kernel::Arg::Kind kind) +{ + p_kind = kind; +} + +bool Kernel::Arg::operator!=(const Arg &b) +{ + bool same = (p_vec_dim == b.p_vec_dim) && + (p_file == b.p_file) 
&& + (p_kind == b.p_kind); + + return !same; +} + +size_t Kernel::Arg::valueSize() const +{ + switch (p_kind) + { + case Invalid: return 0; + case Int8: return 1; + case Int16: return 2; + case Int32: + case Sampler: return 4; + case Int64: return 8; + case Float: return sizeof(cl_float); + case Double: return sizeof(double); + case Buffer: + case Image2D: + case Image3D: return sizeof(cl_mem); + } + + return 0; +} + +unsigned short Kernel::Arg::vecDim() const { return p_vec_dim; } +Kernel::Arg::File Kernel::Arg::file() const { return p_file; } +Kernel::Arg::Kind Kernel::Arg::kind() const { return p_kind; } +size_t Kernel::Arg::targetAlignment() const { return p_targ_align; } +bool Kernel::Arg::defined() const { return p_defined; } +const void * Kernel::Arg::data() const { return p_data; } +size_t Kernel::Arg::allocAtKernelRuntime() const {return p_runtime_alloc;} + +const void *Kernel::Arg::value(unsigned short index) const +{ + const char *data = (const char *)p_data; + unsigned int offset = index * valueSize(); + + data += offset; + + return (const void *)data; +} + diff --git a/src/core/kernel.h b/src/core/kernel.h new file mode 100644 index 0000000..80672ea --- /dev/null +++ b/src/core/kernel.h @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file core/kernel.h + * \brief Kernel + */ + +#ifndef __KERNEL_H__ +#define __KERNEL_H__ + +#include "object.h" + +#include <CL/cl.h> + +#include <vector> +#include <string> +#include <boost/tuple/tuple.hpp> + +namespace llvm +{ + class Function; + class Module; +} + +namespace Coal +{ + +class Program; +class DeviceInterface; +class DeviceKernel; + +/** + * \brief Kernel + * + * A kernel represents a LLVM function that can be run on a device. 
As + * \c Coal::Kernel objects are device-independent, they in fact represent only + * the name of a kernel and the arguments the application wants to pass to it, + * but it also contains a list of LLVM functions for each device for which its + * parent \c Coal::Program has been built + */ +class Kernel : public Object +{ + public: + /** + * \brief Constructor + * \param program Parent \c Coal::Program + */ + Kernel(Program *program); + ~Kernel(); + + /** + * \brief Kernel argument + * + * This class holds OpenCL-related information about the arguments of + * a kernel. It is also used to check that a kernel takes the same + * arguments on every device on which it has been built. + */ + class Arg + { + public: + /** + * \brief Memory address space qualifier + * + * \c Kernel::addFunction() casts the LLVM address space of a pointer + * argument directly to this enum. + */ + enum File + { + Private = 0, /*!< \brief __private */ +#if 1 + Global = 1, /*!< \brief __global */ + Constant = 2, /*!< \brief __constant */ + Local = 3 /*!< \brief __local */ +#else + /* using clang defaults */ + Global = 0xFFFF00, /*!< \brief __global */ + Local = 0xFFFF01, /*!< \brief __local */ + Constant = 0xFFFF02 /*!< \brief __constant */ +#endif + }; + + /** + * \brief Kind of argument (its datatype) + */ + enum Kind + { + Invalid, /*!< \brief Invalid argument */ + Int8, /*!< \brief \c uchar or \c char, \c i8 in LLVM */ + Int16, /*!< \brief \c ushort or \c short, \c i16 in LLVM */ + Int32, /*!< \brief \c uint or \c int, \c i32 in LLVM */ + Int64, /*!< \brief \c ulong or \c long, \c i64 in LLVM */ + Float, /*!< \brief \c float, \c float in LLVM */ + Double, /*!< \brief \c double, \c double in LLVM */ + Buffer, /*!< \brief \c Coal::Buffer or \c Coal::SubBuffer, <tt>type*</tt> in LLVM */ + Image2D, /*!< \brief \c Coal::Image2D, <tt>\%struct.image2d*</tt> in LLVM */ + Image3D, /*!< \brief \c Coal::Image3D, <tt>\%struct.image3d*</tt> in LLVM */ + Sampler /*!< \brief \c Coal::Sampler::bitfield(), \c i32 in LLVM, see \c Coal::Kernel::setArg() */ + }; + + /** + * \brief Constructor + * \param vec_dim 
vector dimension of the argument, 1 if not a vector + * \param file \c File of the argument + * \param kind \c Kind of the argument + * \param targ_align Argument type alignment (ABI specific) + */ + Arg(unsigned short vec_dim, File file, Kind kind, size_t targ_align); + ~Arg(); + + /** + * \brief Allocate the argument + * + * This function must be called before \c loadData(). It + * allocates a buffer in which the argument value can be stored. + * + * \sa valueSize() + */ + void alloc(); + + /** + * \brief Load a value into the argument + * \note \c alloc() must have been called before this function. + * \param data pointer to the bytes to copy into the argument + * \param size number of bytes to copy; must not exceed + * <tt>vecDim() * valueSize()</tt> + * \sa valueSize() + */ + void loadData(const void *data, size_t size); + + /** + * \brief Set the number of bytes that must be allocated at run-time + * + * \c __local arguments don't take a value given by the host + * application, but take pointers allocated on the device + * for each work-group. + * + * This function sets the size of the device-allocated + * memory buffer used by this argument. + * + * \param size size in bytes of the buffer the device has to + * allocate for each work-group of this kernel + */ + void setAllocAtKernelRuntime(size_t size); + + /** + * \brief Changes the \c Kind of this argument + * \param kind new \c Kind + */ + void refineKind(Kind kind); + + /** + * \brief Compares this argument with another + * + * They are different if their \c vec_dim, \c file or \c kind are + * not the same. + * + * \param b other argument to compare + * \return true if this argument doesn't match \p b + */ + bool operator !=(const Arg &b); + + /** + * \brief Size of a field of this arg + * + * This function returns the size of this argument based on its + * \c Kind + * + * \note This size is not multiplied by \c vecDim(), you must do + * this by yourself to find the total space taken by this + * arg. 
+ * \return the size of this argument, in bytes, without any padding + */ + size_t valueSize() const; + unsigned short vecDim() const; /*!< \brief Vector dimension */ + File file() const; /*!< \brief File */ + Kind kind() const; /*!< \brief Kind */ + bool defined() const; /*!< \brief Has the value of this argument already been loaded by the host application ? */ + size_t targetAlignment() const; /*!< \brief Get alignment (bytes) of arg type */ + size_t allocAtKernelRuntime() const; /*!< \brief Size of the \c __local buffer to allocate at kernel runtime */ + const void *value(unsigned short index) const; /*!< \brief Pointer to the value of this argument, for the \p index vector element */ + const void *data() const; /*!< \brief Pointer to the data of this arg, equivalent to <tt>value(0)</tt> */ + + private: + unsigned short p_vec_dim; + File p_file; + Kind p_kind; + void *p_data; + bool p_defined; + size_t p_runtime_alloc; + size_t p_targ_align; + }; + + /** + * \brief Add a \c llvm::Function to this kernel + * + * This function adds a \c llvm::Function to this kernel for the + * specified \p device. It also has the responsibility to find the + * \c Arg::Kind of each of the function's arguments. + * + * LLVM provides a \c llvm::Type for each argument: + * + * - If it is a pointer, the kind of the argument is \c Arg::Buffer and + * its file is a simple cast from a LLVM \c addrspace to \c Arg::File. + * - If it is a pointer to a struct whose name is either + * <tt>\%struct.image2d</tt> or <tt>\%struct.image3d</tt>, kind is set + * to \c Arg::Image2D or \c Arg::Image3D, respectively. + * - If it is a vector, \c vec_dim is set to the vector size, and the + * rest of the computations are done on the element type + * - Then we translate the LLVM type to an \c Arg::Kind. For instance, + * \c i32 becomes \c Arg::Int32 + * + * Samplers aren't detected at this stage because they are plain \c i32 + * types on the LLVM side. 
They are detected in \c setArg() when the + * value being set to the argument appears to be a \c Coal::Sampler. + * + * \param device device for which the function is added + * \param function function to add + * \param module LLVM module of this function + */ + cl_int addFunction(DeviceInterface *device, llvm::Function *function, + llvm::Module *module); + + /** + * \brief Get the LLVM function for a specified \p device + * \param device the device for which a LLVM function is needed + * \return the LLVM function for the given \p device + */ + llvm::Function *function(DeviceInterface *device) const; + + /** + * \brief Set the value of an argument + * + * See the constructor's documentation for a note on the + * \c Coal::Sampler objects + * + * \param index index of the argument + * \param size size of the value being stored in the argument, must match + * <tt>Arg::valueSize() * Arg::vecDim()</tt> + * \param value pointer to the data that will be copied in the argument + * \return \c CL_SUCCESS if success, an error code otherwise + */ + cl_int setArg(cl_uint index, size_t size, const void *value); + + unsigned int numArgs() const; /*!< \brief Number of arguments of this kernel */ + const Arg *arg(unsigned int index) const; /*!< \brief \c Arg at the given \p index */ + + /*! 
\brief \c Coal::DeviceKernel for the specified \p device */ + DeviceKernel *deviceDependentKernel(DeviceInterface *device) const; + llvm::Module *deviceDependentModule(DeviceInterface *device) const; + + bool argsSpecified() const; /*!< \brief true if all the arguments have been set through \c setArg() */ + bool hasLocals() const; /*!< \brief true if one or more argument is in file \c Arg::Local */ + + /** + * \brief Get information about this kernel + * \copydetails Coal::DeviceInterface::info + */ + cl_int info(cl_kernel_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) const; + + + /** + * \brief Get performance hints and device-specific data about this kernel + * \copydetails Coal::DeviceInterface::info + * \param device \c Coal::DeviceInterface on which the kernel will be run + */ + cl_int workGroupInfo(DeviceInterface *device, + cl_kernel_work_group_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) const; + + boost::tuple<uint,uint,uint> reqdWorkGroupSize(llvm::Module *module) const; + + int get_wi_alloca_size() { return wi_alloca_size; } + + std::string p_name; + private: + bool p_has_locals; + int wi_alloca_size; + + struct DeviceDependent + { + DeviceInterface *device; + DeviceKernel *kernel; + llvm::Function *function; + llvm::Module *module; + }; + + std::vector<DeviceDependent> p_device_dependent; + std::vector<Arg *> p_args; + DeviceDependent null_dep; + + const DeviceDependent &deviceDependent(DeviceInterface *device) const; + DeviceDependent &deviceDependent(DeviceInterface *device); + +}; + +} + +struct _cl_kernel : public Coal::Kernel +{}; + +#endif diff --git a/src/core/memobject.cpp b/src/core/memobject.cpp new file mode 100644 index 0000000..5501ac1 --- /dev/null +++ b/src/core/memobject.cpp @@ -0,0 +1,960 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * Copyright (c) 2012-2014, Texas Instruments Incorporated - 
http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** + * \file memobject.cpp + * \brief Memory objects + */ + +#include "CL/cl_ext.h" +#include "memobject.h" +#include "context.h" +#include "deviceinterface.h" +#include "propertylist.h" +#include "events.h" + +#include <cstdlib> +#include <cstring> +#include <iostream> + +using namespace Coal; + +/* + * MemObject + */ + +MemObject::MemObject(Context *ctx, cl_mem_flags flags, void *host_ptr, + cl_int *errcode_ret) +: Object(Object::T_MemObject, ctx), p_num_devices(0), p_flags(flags), + p_host_ptr(host_ptr), p_devicebuffers(0), p_dtor_callback_stack() +{ + // Check the flags value + const cl_mem_flags all_flags = CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY | + CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR | + CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR + |CL_MEM_USE_MSMC_TI; + + if ((flags & CL_MEM_READ_WRITE) && (flags & CL_MEM_READ_ONLY)) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + if ((flags & CL_MEM_READ_WRITE) && (flags & CL_MEM_WRITE_ONLY)) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + if ((flags & CL_MEM_READ_ONLY) && (flags & CL_MEM_WRITE_ONLY)) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + + if ((flags & ~all_flags) != 0) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + + if ((flags & CL_MEM_ALLOC_HOST_PTR) && (flags & CL_MEM_USE_HOST_PTR)) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + + if ((flags & CL_MEM_COPY_HOST_PTR) && (flags & CL_MEM_USE_HOST_PTR)) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + + // Check other values + if ((flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)) != 0 && !host_ptr) + { + *errcode_ret = CL_INVALID_HOST_PTR; + return; + } + + if ((flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)) == 0 && host_ptr) + { + *errcode_ret = CL_INVALID_HOST_PTR; + return; + } +} + +MemObject::~MemObject() +{ + while (!p_dtor_callback_stack.empty()) + { + dtor_callback_t callback; + if (p_dtor_callback_stack.pop(callback)) + callback.first((cl_mem)this, callback.second); + } + + if 
(p_devicebuffers) + { + // Also delete our children in the device + for (unsigned int i=0; i<p_num_devices; ++i) + delete p_devicebuffers[i]; + + std::free((void *)p_devicebuffers); + } +} + +cl_int MemObject::init() +{ + // Get the device list of the context + DeviceInterface **devices = 0; + cl_int rs; + + rs = ((Context *)parent())->info(CL_CONTEXT_NUM_DEVICES, + sizeof(unsigned int), + &p_num_devices, 0); + + if (rs != CL_SUCCESS) + return rs; + + p_devices_to_allocate = p_num_devices; + devices = (DeviceInterface **)std::malloc(p_num_devices * + sizeof(DeviceInterface *)); + + if (!devices) + return CL_OUT_OF_HOST_MEMORY; + + rs = ((Context *)parent())->info(CL_CONTEXT_DEVICES, + p_num_devices * sizeof(DeviceInterface *), + devices, 0); + + if (rs != CL_SUCCESS) + { + std::free((void *)devices); + return rs; + } + + // Allocate a table of DeviceBuffers + p_devicebuffers = (DeviceBuffer **)std::malloc(p_num_devices * + sizeof(DeviceBuffer *)); + + if (!p_devicebuffers) + { + std::free((void *)devices); + return CL_OUT_OF_HOST_MEMORY; + } + + // If we have more than one device, the allocation on the devices is + // defered to first use, so host_ptr can become invalid. So, copy it in + // a RAM location and keep it. 
Also, set a flag telling CPU devices that + // they don't need to reallocate and re-copy host_ptr + // SubBuffer should simply reuse Buffer data + if (p_num_devices > 1 && (p_flags & CL_MEM_COPY_HOST_PTR) + && type() != SubBuffer) + { + void *tmp_hostptr = std::malloc(size()); + + if (!tmp_hostptr) + { + std::free((void *)devices); + return CL_OUT_OF_HOST_MEMORY; + } + + std::memcpy(tmp_hostptr, p_host_ptr, size()); + + p_host_ptr = tmp_hostptr; + // Now, the client application can safely std::free() its host_ptr + } + + // Create a DeviceBuffer for each device + unsigned int failed_devices = 0; + + for (unsigned int i=0; i<p_num_devices; ++i) + { + DeviceInterface *device = devices[i]; + + rs = CL_SUCCESS; + p_devicebuffers[i] = device->createDeviceBuffer(this, &rs); + + if (rs != CL_SUCCESS) + { + p_devicebuffers[i] = 0; + failed_devices++; + } + } + + if (failed_devices == p_num_devices) + { + // Each device found a reason to reject the buffer, so it's invalid + std::free((void *)devices); + return rs; + } + + std::free((void *)devices); + devices = 0; + + // If we have only one device, already allocate the buffer + if (p_num_devices == 1) + { + if (!p_devicebuffers[0]->allocate()) + return CL_MEM_OBJECT_ALLOCATION_FAILURE; + } + + return CL_SUCCESS; +} + +bool MemObject::allocate(DeviceInterface *device) +{ + DeviceBuffer *buffer = deviceBuffer(device); + + if (!buffer->allocated()) + { + return buffer->allocate(); + } + + return true; +} + +cl_mem_flags MemObject::flags() const +{ + return p_flags; +} + +void *MemObject::host_ptr() const +{ + if (type() != SubBuffer) + return p_host_ptr; + else + { + const class SubBuffer *subbuf = (const class SubBuffer *)this; + char *tmp = (char *)subbuf->parent()->host_ptr(); + + if (!tmp) return 0; + + tmp += subbuf->offset(); + + return (void *)tmp; + } +} + +DeviceBuffer *MemObject::deviceBuffer(DeviceInterface *device) const +{ + for (unsigned int i=0; i<p_num_devices; ++i) + { + if (p_devicebuffers[i]->device() == 
device) + return p_devicebuffers[i]; + } + + return 0; +} + +void MemObject::deviceAllocated(DeviceBuffer *buffer) +{ + (void) buffer; + + // Decrement the count of devices that must be allocated. If it becomes + // 0, it means we don't need to keep a copied host_ptr and that we can + // std::free() it. + p_devices_to_allocate--; + + if (p_devices_to_allocate == 0 && + p_num_devices > 1 && + (p_flags & CL_MEM_COPY_HOST_PTR)) + { + std::free(p_host_ptr); + p_host_ptr = 0; + } + +} + +void MemObject::setDestructorCallback(void (CL_CALLBACK *pfn_notify) + (cl_mem memobj, void *user_data), + void *user_data) +{ + p_dtor_callback_stack.push(dtor_callback_t(pfn_notify, user_data)); +} + +// HACK for the union +typedef void * void_p; + +cl_int MemObject::info(cl_mem_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) const +{ + void *value = 0; + size_t value_length = 0; + class SubBuffer *subbuf = (class SubBuffer *)this; + + union { + cl_mem_object_type cl_mem_object_type_var; + cl_mem_flags cl_mem_flags_var; + size_t size_t_var; + void_p void_p_var; + cl_uint cl_uint_var; + cl_context cl_context_var; + cl_mem cl_mem_var; + }; + + switch (param_name) + { + case CL_MEM_TYPE: + switch (type()) + { + case Buffer: + case SubBuffer: + cl_mem_object_type_var = CL_MEM_OBJECT_BUFFER; + break; + + case Image2D: + cl_mem_object_type_var = CL_MEM_OBJECT_IMAGE2D; + break; + + case Image3D: + cl_mem_object_type_var = CL_MEM_OBJECT_IMAGE3D; + break; + } + value = (void *)&cl_mem_object_type_var; + value_length = sizeof(cl_mem_object_type); + break; + + case CL_MEM_FLAGS: + SIMPLE_ASSIGN(cl_mem_flags, p_flags); + break; + + case CL_MEM_SIZE: + SIMPLE_ASSIGN(size_t, size()); + break; + + case CL_MEM_HOST_PTR: + SIMPLE_ASSIGN(void_p, host_ptr()); + break; + + case CL_MEM_MAP_COUNT: + SIMPLE_ASSIGN(cl_uint, 0); // TODO + break; + + case CL_MEM_REFERENCE_COUNT: + SIMPLE_ASSIGN(cl_uint, references()); + break; + + case CL_MEM_CONTEXT: + 
SIMPLE_ASSIGN(cl_context, parent()); + break; + + case CL_MEM_ASSOCIATED_MEMOBJECT: + if (type() != SubBuffer) + SIMPLE_ASSIGN(cl_mem, 0) + else + SIMPLE_ASSIGN(cl_mem, subbuf->parent()); + break; + + case CL_MEM_OFFSET: + if (type() != SubBuffer) + SIMPLE_ASSIGN(cl_mem, 0) + else + SIMPLE_ASSIGN(cl_mem, subbuf->offset()); + break; + + default: + return CL_INVALID_VALUE; + } + + if (param_value && param_value_size < value_length) + return CL_INVALID_VALUE; + + if (param_value_size_ret) + *param_value_size_ret = value_length; + + if (param_value) + std::memcpy(param_value, value, value_length); + + return CL_SUCCESS; +} + +/* + * Buffer + */ + +Buffer::Buffer(Context *ctx, size_t size, void *host_ptr, cl_mem_flags flags, + cl_int *errcode_ret) +: MemObject(ctx, flags, host_ptr, errcode_ret), p_size(size) +{ + if (size == 0) + { + *errcode_ret = CL_INVALID_BUFFER_SIZE; + return; + } + +#if defined(__arm__) + if (size > 512*1024*1024) +#else + if (size > 1*1024*1024*1024) +#endif + { + *errcode_ret = CL_INVALID_BUFFER_SIZE; + return; + } + + // CL_MEM_READ_WRITE is default if not specified {READ,WRITE}_ONLY + if (! (flags & (CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY))) + p_flags |= CL_MEM_READ_WRITE; +} + +size_t Buffer::size() const +{ + return p_size; +} + +MemObject::Type Buffer::type() const +{ + return MemObject::Buffer; +} + +/*---------------------------------------------------------------------------- + * mapped_event: MapBufferEvent when the Map is on a Buffer + * RETURN: true if successful, false if fail + * Traverse currently mapped event list, check overlapping and if either is + * WRITE, insert into list in the increasing order of offset + * TODO: do we need to lock the list for operation??? 
+ *---------------------------------------------------------------------------*/
+bool Buffer::addMapEvent(BufferEvent *mapped_event)
+{
+    MapBufferEvent *mbe = (MapBufferEvent *) mapped_event;
+
+    // Offsets are compared relative to the start of this (parent) buffer, so
+    // a mapping made through a sub-buffer is shifted by the sub-buffer offset
+    size_t mbe_offset = mbe->offset();
+    if (mbe->buffer()->type() == SubBuffer)
+        mbe_offset += ((class SubBuffer *) mbe->buffer())->offset();
+
+    std::list<BufferEvent *>::iterator it, it_insert = p_mapped_events.end();
+    for (it = p_mapped_events.begin(); it != p_mapped_events.end(); ++it)
+    {
+        MapBufferEvent *e = (MapBufferEvent *) (*it);
+        size_t e_offset = e->offset();
+        if (e->buffer()->type() == SubBuffer)
+            e_offset += ((class SubBuffer *) e->buffer())->offset();
+
+        // Keep only the FIRST entry starting after the new mapping: inserting
+        // in front of it preserves the increasing order of offsets. Taking a
+        // later match would put the new event in the middle of larger ones.
+        if (it_insert == p_mapped_events.end() && mbe_offset < e_offset)
+            it_insert = it;
+
+        // Overlapping ranges are only allowed when both mappings are reads
+        if (   mbe_offset <= e_offset + e->cb() - 1
+            && e_offset <= mbe_offset + mbe->cb() - 1)
+            if ((mbe->flags() & CL_MAP_WRITE) ||
+                (e->flags() & CL_MAP_WRITE))
+                return false;
+    }
+
+    // Nothing conflicts: record the mapping (at the end if no larger offset)
+    p_mapped_events.insert(it_insert, mapped_event);
+    return true;
+}
+
+/*----------------------------------------------------------------------------
+ * mapped_ptr: mapped pointer from previous MapBuffer/MapImage Event
+ * RETURN: first MappedBufferEvent with same mapped_ptr in the list
+ * TODO: do we need to lock the list for operation???
+ *---------------------------------------------------------------------------*/ +BufferEvent* Buffer::removeMapEvent(void *mapped_ptr) +{ + std::list<BufferEvent *>::iterator it; + for (it = p_mapped_events.begin(); it != p_mapped_events.end(); ++it) + { + MapBufferEvent *e = (MapBufferEvent *) (*it); + if (e->ptr() != mapped_ptr) continue; + p_mapped_events.erase(it); + return e; + } + return NULL; +} + +/* + * SubBuffer + */ + +SubBuffer::SubBuffer(class Buffer *parent, size_t offset, size_t size, + cl_mem_flags flags, cl_int *errcode_ret) +: MemObject((Context *)parent->parent(), flags, 0, errcode_ret), p_offset(offset), + p_size(size), p_parent(parent) +{ + clRetainMemObject((cl_mem) p_parent); + + if (size == 0) + { + *errcode_ret = CL_INVALID_BUFFER_SIZE; + return; + } + + if (offset + size > parent->size()) + { + *errcode_ret = CL_INVALID_BUFFER_SIZE; + return; + } + + // Check the compatibility of flags and parent->flags() + const cl_mem_flags wrong_flags = + CL_MEM_ALLOC_HOST_PTR | + CL_MEM_USE_HOST_PTR | + CL_MEM_COPY_HOST_PTR; + + if (flags & wrong_flags) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + + if ((parent->flags() & CL_MEM_WRITE_ONLY) && + (flags & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY))) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + + if ((parent->flags() & CL_MEM_READ_ONLY) && + (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) + { + *errcode_ret = CL_INVALID_VALUE; + return; + } + + // OpenCL 1.2: SubBuffer should inherit some of parent Buffer flags + cl_mem_flags parent_rw_flags = parent->flags() + & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY); + cl_mem_flags my_rw_flags = p_flags + & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY); + // parent be READ_WRITE, subBuffer be READ_ONLY/WRITE_ONLY (Spec allows) + if (! 
my_rw_flags) p_flags |= parent_rw_flags; + cl_mem_flags parent_hostptr_flags = parent->flags() + & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR); + if (parent_hostptr_flags) p_flags |= parent_hostptr_flags; +} + +SubBuffer::~SubBuffer() +{ + clReleaseMemObject((cl_mem) p_parent); +} + +size_t SubBuffer::size() const +{ + return p_size; +} + +MemObject::Type SubBuffer::type() const +{ + return MemObject::SubBuffer; +} + +bool SubBuffer::allocate(DeviceInterface *device) +{ + // SubBuffer always use Buffer's data + return p_parent->allocate(device); +} + +size_t SubBuffer::offset() const +{ + return p_offset; +} + +Buffer *SubBuffer::parent() const +{ + return p_parent; +} + +bool SubBuffer::addMapEvent(BufferEvent *mapped_event) +{ + return p_parent->addMapEvent(mapped_event); +} + +BufferEvent* SubBuffer::removeMapEvent(void *mapped_ptr) +{ + return p_parent->removeMapEvent(mapped_ptr); +} + +/* + * Image2D + */ + +Image2D::Image2D(Context *ctx, size_t width, size_t height, size_t row_pitch, + const cl_image_format *format, void *host_ptr, + cl_mem_flags flags, cl_int *errcode_ret) +: MemObject(ctx, flags, host_ptr, errcode_ret), + p_width(width), p_height(height), p_row_pitch(row_pitch) +{ + if (!width || !height) + { + *errcode_ret = CL_INVALID_IMAGE_SIZE; + return; + } + + if (!format) + { + *errcode_ret = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR; + return; + } + + p_format = *format; + + // Check format descriptor + switch (p_format.image_channel_data_type) + { + case CL_UNORM_INT_101010: + case CL_UNORM_SHORT_555: + case CL_UNORM_SHORT_565: + if (p_format.image_channel_order != CL_RGB || + p_format.image_channel_order != CL_RGBx) + { + *errcode_ret = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR; + return; + } + } + + switch (p_format.image_channel_order) + { + case CL_LUMINANCE: + case CL_INTENSITY: + switch (p_format.image_channel_data_type) + { + case CL_UNORM_INT8: + case CL_UNORM_INT16: + case CL_SNORM_INT8: + case CL_SNORM_INT16: + case 
CL_HALF_FLOAT: + case CL_FLOAT: + break; + default: + *errcode_ret = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR; + return; + } + break; + + case CL_RGB: + case CL_RGBx: + switch (p_format.image_channel_data_type) + { + case CL_UNORM_SHORT_555: + case CL_UNORM_SHORT_565: + case CL_UNORM_INT_101010: + break; + default: + *errcode_ret = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR; + return; + } + break; + + case CL_ARGB: + case CL_BGRA: + switch (p_format.image_channel_data_type) + { + case CL_UNORM_INT8: + case CL_SNORM_INT8: + case CL_SIGNED_INT8: + case CL_UNSIGNED_INT8: + break; + default: + *errcode_ret = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR; + return; + } + break; + } + + // Row pitch + p_row_pitch = width * pixel_size(p_format); + + if (row_pitch) + { + if (!host_ptr) + { + // row_pitch must be 0 if host_ptr is null + *errcode_ret = CL_INVALID_IMAGE_SIZE; + return; + } + if (row_pitch < p_row_pitch) + { + *errcode_ret = CL_INVALID_IMAGE_SIZE; + return; + } + if (row_pitch % pixel_size(p_format) != 0) + { + *errcode_ret = CL_INVALID_IMAGE_SIZE; + return; + } + + p_row_pitch = row_pitch; + } +} + +size_t Image2D::size() const +{ + return height() * row_pitch(); +} + +MemObject::Type Image2D::type() const +{ + return MemObject::Image2D; +} + +size_t Image2D::width() const +{ + return p_width; +} + +size_t Image2D::height() const +{ + return p_height; +} + +size_t Image2D::row_pitch() const +{ + return p_row_pitch; +} + +size_t Image2D::slice_pitch() const +{ + // An Image2D is made of only one slice + return size(); +} + +const cl_image_format &Image2D::format() const +{ + return p_format; +} + +cl_int Image2D::imageInfo(cl_image_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) const +{ + void *value = 0; + size_t value_length = 0; + class Image3D *image3D = (class Image3D *)this; + + union { + cl_image_format cl_image_format_var; + size_t size_t_var; + }; + + switch (param_name) + { + case CL_IMAGE_FORMAT: + 
SIMPLE_ASSIGN(cl_image_format, format()); + break; + + case CL_IMAGE_ELEMENT_SIZE: + SIMPLE_ASSIGN(size_t, element_size(p_format)); + break; + + case CL_IMAGE_ROW_PITCH: + // TODO: What was given when the image was created or width*size ? + SIMPLE_ASSIGN(size_t, row_pitch()); + break; + + case CL_IMAGE_SLICE_PITCH: + if (type() == Image3D) + SIMPLE_ASSIGN(size_t, image3D->slice_pitch()) + else + SIMPLE_ASSIGN(size_t, 0); + break; + + case CL_IMAGE_WIDTH: + SIMPLE_ASSIGN(size_t, width()); + break; + + case CL_IMAGE_HEIGHT: + SIMPLE_ASSIGN(size_t, height()); + break; + + case CL_IMAGE_DEPTH: + if (type() == Image3D) + SIMPLE_ASSIGN(size_t, image3D->depth()) + else + SIMPLE_ASSIGN(size_t, 0); + break; + default: + return CL_INVALID_VALUE; + } + + if (param_value && param_value_size < value_length) + return CL_INVALID_VALUE; + + if (param_value_size_ret) + *param_value_size_ret = value_length; + + if (param_value) + std::memcpy(param_value, value, value_length); + + return CL_SUCCESS; +} + +size_t Image2D::element_size(const cl_image_format &format) +{ + switch (format.image_channel_data_type) + { + case CL_SNORM_INT8: + case CL_UNORM_INT8: + case CL_SIGNED_INT8: + case CL_UNSIGNED_INT8: + return 1; + case CL_SNORM_INT16: + case CL_UNORM_INT16: + case CL_SIGNED_INT16: + case CL_UNSIGNED_INT16: + return 2; + case CL_SIGNED_INT32: + case CL_UNSIGNED_INT32: + return 4; + case CL_FLOAT: + return sizeof(float); + case CL_HALF_FLOAT: + return 2; + case CL_UNORM_SHORT_565: + case CL_UNORM_SHORT_555: + return 2; + case CL_UNORM_INT_101010: + return 4; + default: + return 0; + } +} + +unsigned int Image2D::channels(const cl_image_format &format) +{ + switch (format.image_channel_order) + { + case CL_R: + case CL_Rx: + case CL_A: + case CL_INTENSITY: + case CL_LUMINANCE: + return 1; + break; + + case CL_RG: + case CL_RGx: + case CL_RA: + return 2; + break; + + case CL_RGBA: + case CL_ARGB: + case CL_BGRA: + return 4; + break; + + case CL_RGBx: + case CL_RGB: + return 1; // Only 
special data types allowed (565, 555, etc) + break; + + default: + return 0; + } +} + +size_t Image2D::pixel_size(const cl_image_format &format) +{ + switch (format.image_channel_data_type) + { + case CL_UNORM_SHORT_565: + case CL_UNORM_SHORT_555: + return 2; + case CL_UNORM_INT_101010: + return 4; + default: + return channels(format) * element_size(format); + } +} + +size_t Image2D::element_size() const +{ + return element_size(p_format); +} + +size_t Image2D::pixel_size() const +{ + return pixel_size(p_format); +} + +unsigned int Image2D::channels() const +{ + return channels(p_format); +} + +/* + * Image3D + */ + +Image3D::Image3D(Context *ctx, size_t width, size_t height, size_t depth, + size_t row_pitch, size_t slice_pitch, + const cl_image_format *format, void *host_ptr, + cl_mem_flags flags, cl_int *errcode_ret) +: Image2D(ctx, width, height, row_pitch, format, host_ptr, flags, errcode_ret), + p_depth(depth) +{ + if (depth <= 1) + { + *errcode_ret = CL_INVALID_IMAGE_SIZE; + return; + } + + // Slice pitch + p_slice_pitch = height * this->row_pitch(); + + if (slice_pitch) + { + if (!host_ptr) + { + // slice_pitch must be 0 if host_ptr is null + *errcode_ret = CL_INVALID_IMAGE_SIZE; + return; + } + if (slice_pitch < p_slice_pitch) + { + *errcode_ret = CL_INVALID_IMAGE_SIZE; + return; + } + if (slice_pitch % this->row_pitch() != 0) + { + *errcode_ret = CL_INVALID_IMAGE_SIZE; + return; + } + + p_slice_pitch = slice_pitch; + } +} + +size_t Image3D::size() const +{ + return depth() * slice_pitch(); +} + +MemObject::Type Image3D::type() const +{ + return MemObject::Image3D; +} + +size_t Image3D::depth() const +{ + return p_depth; +} + +size_t Image3D::slice_pitch() const +{ + return p_slice_pitch; +} diff --git a/src/core/memobject.h b/src/core/memobject.h new file mode 100644 index 0000000..82cbfab --- /dev/null +++ b/src/core/memobject.h @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * Copyright (c) 2012-2014, Texas 
Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+/**
+ * \file memobject.h
+ * \brief Memory objects
+ */
+
+#ifndef __MEMOBJECT_H__
+#define __MEMOBJECT_H__
+
+#include "object.h"
+#include "dsp/u_concurrent_stack.h"
+
+#include <CL/cl.h>
+
+namespace Coal
+{
+
+class DeviceBuffer;
+class Context;
+class DeviceInterface;
+class BufferEvent;
+
+/**
+ * \brief Base class for all the memory objects
+ */
+class MemObject : public Object
+{
+    public:
+        /**
+         * \brief Type of memory object
+         */
+        enum Type
+        {
+            Buffer,
+            SubBuffer,
+            Image2D,
+            Image3D
+        };
+
+        /**
+         * \brief Constructor
+         * \param ctx parent \c Coal::Context
+         * \param flags memory object flags
+         * \param host_ptr host pointer used by some flags (see the OpenCL spec)
+         * \param errcode_ret return value
+         * \note Don't do any initialization here, but in \c init(). We only fill
+         *       the private variables and check the values passed in argument.
+         * \sa init
+         */
+        MemObject(Context *ctx, cl_mem_flags flags, void *host_ptr,
+                  cl_int *errcode_ret);
+        virtual ~MemObject();
+
+        /**
+         * \brief Initialize the memory object
+         *
+         * Memory objects are device-independent classes. This function creates
+         * one \c Coal::DeviceBuffer per device present in the context by
+         * calling \c Coal::DeviceInterface::createDeviceBuffer().
+         *
+         * If there is only one device, its \c Coal::DeviceBuffer is directly
+         * allocated. If there is more than one device, the allocation is
+         * deferred until a \c Coal::Event is pushed for this device.
+         *
+         * \return \c CL_SUCCESS if success, an error code otherwise
+         */
+        virtual cl_int init();
+        virtual bool allocate(DeviceInterface *device); /*!< \brief Allocate this memory object on the given \p device */
+        virtual size_t size() const = 0;                /*!< \brief Device-independent size of the memory object */
+        virtual Type type() const = 0;                  /*!< \brief Type of the memory object */
+
+        cl_mem_flags flags() const;                     /*!< \brief Flags */
+        void *host_ptr() const;                         /*!< \brief Host pointer (for a sub-buffer: the parent's host pointer plus the sub-buffer offset) */
+        DeviceBuffer *deviceBuffer(DeviceInterface *device) const; /*!< \brief \c Coal::DeviceBuffer for the given \p device, 0 if none matches */
+
+        void deviceAllocated(DeviceBuffer *buffer);     /*!< \brief Notify this object that \p buffer has been allocated on its device; once no device remains to allocate, the host copy made by \c init() for \c CL_MEM_COPY_HOST_PTR is freed */
+
+        /**
+         * \brief Set a destructor callback for this memory object
+         *
+         * This callback is called when this memory object is deleted. It is
+         * currently called from the destructor, so the memory object is already
+         * invalid, but as OpenCL objects are immutable, the callback cannot
+         * use its \c memobj parameter except in a pointer comparison, and there
+         * is no problem.
+         *
+         * \param pfn_notify function to call when the memory object is deleted
+         * \param user_data user data to pass to this function
+         */
+        void setDestructorCallback(void (CL_CALLBACK *pfn_notify)(cl_mem memobj,
+                                                                  void *user_data),
+                                   void *user_data);
+
+        /**
+         * \brief Get information about this memory object
+         * \copydetails Coal::DeviceInterface::info
+         */
+        cl_int info(cl_mem_info param_name,
+                    size_t param_value_size,
+                    void *param_value,
+                    size_t *param_value_size_ret) const;
+
+        /**
+         * \brief Record a map event on this memory object
+         *
+         * The base implementation records nothing and returns false.
+         * \c Coal::Buffer and \c Coal::SubBuffer override it.
+         */
+        virtual bool addMapEvent(BufferEvent *mapped_event) { return false; }
+
+        /**
+         * \brief Remove the map event that produced \p mapped_ptr
+         *
+         * The base implementation removes nothing and returns NULL.
+         * \c Coal::Buffer and \c Coal::SubBuffer override it.
+         */
+        virtual BufferEvent* removeMapEvent(void *mapped_ptr) { return NULL; }
+
+    protected:
+        cl_mem_flags p_flags;                     // flags given at creation; subclasses may add bits
+        std::list<BufferEvent *> p_mapped_events; // map events recorded by addMapEvent()
+
+    private:
+        unsigned int p_num_devices, p_devices_to_allocate; // devices of the context / devices still to be allocated
+        void *p_host_ptr;                                  // application host pointer, or the copy made by init()
+        DeviceBuffer **p_devicebuffers;                    // one entry per device of the context
+
+        typedef std::pair<void (CL_CALLBACK *)(cl_mem memobj, void *user_data), void*> dtor_callback_t;
+        concurrent_stack<dtor_callback_t> p_dtor_callback_stack; // callbacks invoked by the destructor
+
+        //void (CL_CALLBACK *p_dtor_callback)(cl_mem memobj, void *user_data);
+        //void *p_dtor_userdata;
+};
+
+/**
+ * \brief Simple buffer object
+ */
+class Buffer : public MemObject
+{
+    public:
+        /**
+         * \brief Constructor
+         * \param ctx parent \c Coal::Context
+         * \param size size of the buffer, in bytes
+         * \param host_ptr host pointer
+         * \param flags memory flags
+         * \param errcode_ret return code
+         */
+        Buffer(Context *ctx, size_t size, void *host_ptr, cl_mem_flags flags,
+               cl_int *errcode_ret);
+
+        size_t size() const;    /*!< \brief Size of the buffer, in bytes */
+        Type type() const;      /*!< \brief Return that we are a \c Coal::MemObject::Buffer */
+
+        bool addMapEvent(BufferEvent *mapped_event);    /*!< \brief Record \p mapped_event; false if it overlaps an existing mapping and either of the two is a write mapping */
+        BufferEvent* removeMapEvent(void *mapped_ptr);  /*!< \brief Remove and return the first recorded map event whose pointer is \p mapped_ptr, NULL if none */
+    private:
+        size_t p_size;
+
+};
+
+/**
+ * \brief Sub-buffer
+ */
+class SubBuffer : public MemObject
+{
+    public:
+        /**
+         * \brief Constructor
+         * \param parent parent \c Coal::Buffer
+         * \param offset offset in \p parent of the start of this sub-buffer
+         * \param size size of the sub-buffer
+         * \param flags memory flags (must be compatible with the \p parent's ones)
+         * \param errcode_ret return code
+         */
+        SubBuffer(class Buffer *parent, size_t offset, size_t size,
+                  cl_mem_flags flags, cl_int *errcode_ret);
+        ~SubBuffer();
+
+        size_t size() const;                    /*!< \brief Size */
+        Type type() const;                      /*!< \brief Return that we are a \c Coal::MemObject::SubBuffer */
+        bool allocate(DeviceInterface *device); /*!< \brief Allocate the \b parent \c Coal::Buffer */
+
+        size_t offset() const;                  /*!< \brief Offset in bytes */
+        class Buffer *parent() const;           /*!< \brief Parent \c Coal::Buffer */
+
+        bool addMapEvent(BufferEvent *mapped_event);    /*!< \brief Forwarded to the parent \c Coal::Buffer */
+        BufferEvent* removeMapEvent(void *mapped_ptr);  /*!< \brief Forwarded to the parent \c Coal::Buffer */
+    private:
+        size_t p_offset, p_size;
+        class Buffer *p_parent;
+};
+
+/**
+ * \brief 2D image
+ */
+class Image2D : public MemObject
+{
+    public:
+        /**
+         * \brief Constructor
+         * \param ctx parent \c Coal::Context
+         * \param width width of the image
+         * \param height height of the image
+         * \param row_pitch number of bytes in a row of pixels. If 0, defaults to <tt>width * pixel_size()</tt>
+         * \param format image format
+         * \param host_ptr host pointer
+         * \param flags memory flags
+         * \param errcode_ret return code
+         */
+        Image2D(Context *ctx, size_t width, size_t height, size_t row_pitch,
+                const cl_image_format *format, void *host_ptr,
+                cl_mem_flags flags, cl_int *errcode_ret);
+
+        virtual size_t size() const;            /*!< \brief Size in bytes */
+        virtual Type type() const;              /*!< \brief Return that we are a \c Coal::MemObject::Image2D */
+
+        size_t width() const;                   /*!< \brief Width */
+        size_t height() const;                  /*!< \brief Height */
+        size_t row_pitch() const;               /*!< \brief Size in bytes of a row of pixels */
+        virtual size_t slice_pitch() const;     /*!< \brief Size in bytes of the image */
+        const cl_image_format &format() const;  /*!< \brief Image format descriptor */
+
+        /**
+         * \brief Information about this image object
+         *
+         * This function is also usable for \c Coal::Image3D objects as it does
+         * casting when necessary in order to give information when needed.
+         *
+         * \copydetails Coal::DeviceInterface::info
+         */
+        cl_int imageInfo(cl_image_info param_name,
+                         size_t param_value_size,
+                         void *param_value,
+                         size_t *param_value_size_ret) const;
+
+        static size_t element_size(const cl_image_format &format);  /*!< \brief Size in bytes of each channel of \p format */
+        static unsigned int channels(const cl_image_format &format);/*!< \brief Number of channels of \p format */
+        static size_t pixel_size(const cl_image_format &format);    /*!< \brief Size in bytes of a pixel in \p format */
+        size_t pixel_size() const;                                  /*!< \brief Pixel size of this image */
+        size_t element_size() const;                                /*!< \brief Channel size of this image */
+        unsigned int channels() const;                              /*!< \brief Number of channels of this image */
+
+    private:
+        size_t p_width, p_height, p_row_pitch;
+        cl_image_format p_format;
+};
+
+/**
+ * \brief 3D image
+ */
+class Image3D : public Image2D
+{
+    public:
+        /**
+         * \brief Constructor
+         * \param ctx parent \c Coal::Context
+         * \param width width of the image
+         * \param height height of the image
+         * \param depth depth of the image
+         * \param row_pitch number of bytes in a row of pixels. If 0, defaults to <tt>width * pixel_size()</tt>
+         * \param slice_pitch number of bytes in a 2D slice. If 0, defaults to <tt>height * row_pitch()</tt>
+         * \param format image format
+         * \param host_ptr host pointer
+         * \param flags memory flags
+         * \param errcode_ret return code
+         */
+        Image3D(Context *ctx, size_t width, size_t height, size_t depth,
+                size_t row_pitch, size_t slice_pitch,
+                const cl_image_format *format, void *host_ptr,
+                cl_mem_flags flags, cl_int *errcode_ret);
+
+        size_t size() const;            /*!< \brief Size in bytes of this image */
+        Type type() const;              /*!< \brief Return that we are a \c Coal::MemObject::Image3D */
+
+        size_t depth() const;           /*!< \brief Depth of the image */
+        size_t slice_pitch() const;     /*!< \brief Size in bytes of a 2D slice */
+
+    private:
+        size_t p_depth, p_slice_pitch;
+};
+
+}
+
+struct _cl_mem : public Coal::MemObject
+{};
+
+#endif
diff --git a/src/core/object.cpp b/src/core/object.cpp
new file mode 100644
index 0000000..be44279
--- /dev/null
+++ b/src/core/object.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the copyright holder nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file object.cpp + * \brief Reference-counted object tree + */ + +#include "object.h" + +using namespace Coal; + +static std::list<Object *>& getKnownObjects() +{ + static std::list<Object *> known_objects; + return known_objects; +} + + +Object::Object(Type type, Object *parent) +: p_references(1), p_parent(parent), p_type(type), p_release_parent(true) +{ + if (parent) + parent->reference(); + + // Add object in the list of known objects + getKnownObjects().push_front(this); + p_it = getKnownObjects().begin(); +} + +Object::~Object() +{ + if (p_parent && p_parent->dereference() && p_release_parent) + delete p_parent; + + // Remove object from the list of known objects + getKnownObjects().erase(p_it); +} + +void Object::reference() +{ + p_references++; +} + +bool Object::dereference() +{ + p_references--; + return (p_references == 0); +} + +void Object::setReleaseParent (bool release) +{ + p_release_parent = release; +} + +unsigned int Object::references() const +{ + return p_references; +} + +Object *Object::parent() const +{ + return p_parent; +} + +Object::Type Object::type() const +{ + return p_type; +} + +bool Object::isA(Object::Type type) const +{ + // Check for null values + 
if (this == 0) + return false; + + // Check that the value isn't garbage or freed pointer + std::list<Object *>::const_iterator it = getKnownObjects().begin(), + e = getKnownObjects().end(); + while (it != e) + { + if (*it == this) + // OK, NOW it is safe to dereference this ptr: + return this->type() == type; + + ++it; + } + + return false; +} diff --git a/src/core/object.h b/src/core/object.h new file mode 100644 index 0000000..d83e326 --- /dev/null +++ b/src/core/object.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/**
 * \file object.h
 * \brief Object tree
 */

#ifndef __REFCOUNTED_H__
#define __REFCOUNTED_H__

#include <list>

namespace Coal
{

/**
 * \brief Base class of all the Clover objects
 *
 * This class implements functions needed by all the Clover objects, like
 * reference counting, the object tree (parents/children), etc.
 *
 * It also uses a special list of known objects, used to check that a pointer
 * passed by the user to an OpenCL function actually is an object of the correct
 * type. See \c isA().
 */
class Object
{
    public:
        /**
         * \brief Type of object the inherited class actually is
         */
        enum Type
        {
            T_Device,       /*!< \brief \c Coal::DeviceInterface */
            T_CommandQueue, /*!< \brief \c Coal::CommandQueue */
            T_Event,        /*!< \brief \c Coal::Event */
            T_Context,      /*!< \brief \c Coal::Context */
            T_Kernel,       /*!< \brief \c Coal::Kernel */
            T_MemObject,    /*!< \brief \c Coal::MemObject */
            T_Program,      /*!< \brief \c Coal::Program */
            T_Sampler       /*!< \brief \c Coal::Sampler */
        };

        /**
         * \brief Constructor
         * \param type type of the child class calling this constructor
         * \param parent parent object
         */
        Object(Type type, Object *parent = 0);

        /**
         * \brief Destructor
         *
         * Virtual so that deleting a derived object through an \c Object
         * pointer runs the derived destructor. According to the
         * documentation of \c setReleaseParent(), it also dereferences the
         * parent object.
         */
        virtual ~Object();

        /**
         * \brief Increments the reference counter
         */
        void reference();

        /**
         * \brief Decrements the reference counter
         * \return true if the reference counter has reached 0
         */
        bool dereference();

        /**
         * \brief Reference counter
         * \return the number of references of this class currently in use
         */
        unsigned int references() const;

        /**
         * \brief Set if the parent object has to be deleted if its reference count reaches 0
         *
         * The destructor of \c Coal::Object dereferences its parent object.
         * This is done in order to correctly free objects when no object has
         * a reference to it anymore.
         *
         * Some objects such as \c Coal::CommandQueue need to do some operations
         * before being deleted. This function tells \c Coal::Object to
         * dereference its parent object, but not to call \b delete on it.
         *
         * \param release true to have \b delete called on the parent object
         *                when its reference count reaches 0, false to keep it
         */
        void setReleaseParent(bool release);

        Object *parent() const; /*!< \brief Parent object */
        Type type() const;      /*!< \brief Type */

        /**
         * \brief Returns whether this object is an instance of \p type
         * \note This function begins with a NULL-check on the \c this pointer,
         *       so it's safe to use even when \c this is not guaranteed not to
         *       be NULL.
         * \note NOTE(review): calling a member function through a NULL pointer
         *       is undefined behaviour in standard C++, and optimising
         *       compilers are allowed to drop the \c this check. Callers
         *       should prefer an explicit NULL test before calling \c isA()
         *       - confirm against the toolchains in use.
         * \param type type this object must have for the check to pass
         * \return true if this object exists and has the correct type
         */
        bool isA(Type type) const;

    private:
        // NOTE(review): the implementation of the members below is not part
        // of this header; the descriptions are inferred from the accessor
        // documentation above and should be confirmed against object.cpp.
        unsigned int p_references;         // presumably the counter behind reference()/dereference()/references()
        Object *p_parent;                  // parent object, returned by parent(); may be 0 (constructor default)
        Type p_type;                       // concrete type tag, returned by type() and checked by isA()
        std::list<Object *>::iterator p_it; // presumably this object's position in the list of known objects - TODO confirm
        bool p_release_parent;             // value set through setReleaseParent()
};

}

#endif
diff --git a/src/core/platform.cpp b/src/core/platform.cpp new file mode 100644 index 0000000..1af6153 --- /dev/null +++ b/src/core/platform.cpp @@ -0,0 +1,227 @@
/******************************************************************************
 * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions are met:
 *       * Redistributions of source code must retain the above copyright
 *         notice, this list of conditions and the following disclaimer.
 *       * Redistributions in binary form must reproduce the above copyright
 *         notice, this list of conditions and the following disclaimer in the
 *         documentation and/or other materials provided with the distribution.
 *       * Neither the name of Texas Instruments Incorporated nor the
 *         names of its contributors may be used to endorse or promote products
 *         derived from this software without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include <list> +#include <iostream> + +#include "CL/cl.h" +#include "CL/cl_ext.h" +#include "platform.h" +#include "propertylist.h" +#include "object.h" +#include "cpu/device.h" +#ifndef SHAMROCK_BUILD +#include "dsp/device.h" +#include "dsp/driver.h" +#endif + +/*----------------------------------------------------------------------------- +* For the lock file +*----------------------------------------------------------------------------*/ +#include <sys/file.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <errno.h> + +using namespace Coal; + +/****************************************************************************** +* begin_file_lock_crit_section +******************************************************************************/ +static int begin_file_lock_crit_section(char* fname) +{ + /*--------------------------------------------------------------------- + * Create a lock, so only 1 OpenCL program can progress at a time. + * I'm not sure about the appropriateness of putting this in the ctor. 
+ * We may look at delayed ctor of platform with this in it. + *--------------------------------------------------------------------*/ + int lock_fd = open(fname, O_CREAT, + S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH); + + std::string str_fname(fname); + + if (lock_fd < 0) + { + std::cout << "Can not open lock file " << str_fname << ", Aborting !" << std::endl; + exit(-1); + } + + int res = flock(lock_fd, LOCK_EX|LOCK_NB); + if (res == -1) + { + if (errno == EWOULDBLOCK) + { + std::cout << "Waiting on lock " << str_fname << " ..." << std::endl; + res = flock(lock_fd, LOCK_EX); + if (res == -1) + { + std::cout << "Error Locking file " << str_fname << ", Aborting !" << std::endl; + exit(-1); + } + else std::cout << "Acquired lock " << str_fname << ", Proceeding!" << std::endl; + } + else + { + std::cout << "Error Locking file " << str_fname << ", Aborting !" << std::endl; + exit(-1); + } + } + + return lock_fd; + +} + +namespace Coal +{ + Platform::Platform() : dispatch(&dispatch_table) + { + char filename[] = "/var/lock/opencl"; + p_lock_fd = begin_file_lock_crit_section(filename); + + p_devices.push_back((_cl_device_id*)new Coal::CPUDevice); + + // Driver class only exists for the DSPDevice, so need this guard: +#ifndef SHAMROCK_BUILD + for (int i = 0; i < Driver::instance()->num_dsps(); i++) + p_devices.push_back((_cl_device_id*)new Coal::DSPDevice(i)); +#endif + } + + Platform::~Platform() + { + flock(p_lock_fd, LOCK_UN); + close(p_lock_fd); + + for (int i = 0; i < p_devices.size(); i++) + delete p_devices[i]; + } + + cl_uint Platform::getDevices(cl_device_type device_type, + cl_uint num_entries, cl_device_id * devices) + { + cl_uint device_number = 0; + + if (device_type == CL_DEVICE_TYPE_DEFAULT) +#ifdef SHAMROCK_BUILD + device_type = CL_DEVICE_TYPE_CPU; +#else + device_type = CL_DEVICE_TYPE_ACCELERATOR; +#endif + + for (int d = 0; d < p_devices.size(); d++) + { + cl_device_type type; + p_devices[d]->info(CL_DEVICE_TYPE, sizeof(cl_device_type), &type,0); + 
+ if (type & device_type) + { + if (devices && device_number < num_entries) + devices[device_number++] = p_devices[d]; + else device_number++; + } + } + + return device_number; + } + + cl_int Platform::info(cl_mem_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) const + { + void *value = 0; + size_t value_length = 0; + + switch (param_name) + { + case CL_PLATFORM_PROFILE: + STRING_ASSIGN("FULL_PROFILE"); + break; + + case CL_PLATFORM_VERSION: +#ifdef SHAMROCK_BUILD + STRING_ASSIGN("OpenCL 1.1 Shamrock "); +#else + STRING_ASSIGN("OpenCL 1.1 TI "); +#endif + break; + + case CL_PLATFORM_NAME: +#ifdef SHAMROCK_BUILD + STRING_ASSIGN("Shamrock OpenCL for Arm"); +#else +#if defined(__arm__) + STRING_ASSIGN("TI OpenCL for Arm + Dsp"); +#else + STRING_ASSIGN("TI OpenCL for Advantech DSPC868x"); +#endif +#endif + break; + + case CL_PLATFORM_VENDOR: +#ifdef SHAMROCK_BUILD + STRING_ASSIGN("Open Source Software"); +#else + STRING_ASSIGN("Texas Instruments, Inc."); +#endif + break; + + case CL_PLATFORM_EXTENSIONS: + // TODO add cl_khr_icd when it works +#ifdef SHAMROCK_BUILD + STRING_ASSIGN("cl_khr_byte_addressable_store cl_khr_fp64"); +#else + STRING_ASSIGN("cl_khr_byte_addressable_store cl_khr_fp64 cl_ti_msmc_buffers"); +#endif + break; + + case CL_PLATFORM_ICD_SUFFIX_KHR: +#ifndef SHAMROCK_BUILD + STRING_ASSIGN("TI"); +#endif + break; + + default: + return CL_INVALID_VALUE; + } + + if (param_value && param_value_size < value_length) + return CL_INVALID_VALUE; + + if (param_value_size_ret) + *param_value_size_ret = value_length; + + if (param_value) + std::memcpy(param_value, value, value_length); + + return CL_SUCCESS; + } +}; + +_cl_platform_id the_platform; diff --git a/src/core/platform.h b/src/core/platform.h new file mode 100644 index 0000000..809d12c --- /dev/null +++ b/src/core/platform.h @@ -0,0 +1,65 @@ +/****************************************************************************** + * Copyright (c) 2012-2014, Texas 
Instruments Incorporated - http://www.ti.com/
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions are met:
 *       * Redistributions of source code must retain the above copyright
 *         notice, this list of conditions and the following disclaimer.
 *       * Redistributions in binary form must reproduce the above copyright
 *         notice, this list of conditions and the following disclaimer in the
 *         documentation and/or other materials provided with the distribution.
 *       * Neither the name of Texas Instruments Incorporated nor the
 *         names of its contributors may be used to endorse or promote products
 *         derived from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 *   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 *   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 *   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 *   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 *   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 *   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 *   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 *   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 *   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 *   THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
#ifndef __PLATFORM_H__
#define __PLATFORM_H__

#include <CL/cl.h>
#include <vector>
#include <cstring>
#include "icd.h"

namespace Coal
{

/**
 * \brief The single OpenCL platform exposed by this implementation
 *
 * Owns the list of devices and, for its whole lifetime, a lock on a lock
 * file (see the constructor in platform.cpp).
 */
class Platform
{
    public:
        /**
         * \brief Acquires the lock file and creates the device objects
         */
        Platform();

        /**
         * \brief Releases the lock file and deletes the device objects
         */
        ~Platform();

        /**
         * \brief Lists the devices whose type matches \p device_type
         * \param device_type bit-field of requested device types
         * \param num_entries number of slots available in \p devices
         * \param devices array receiving the matching devices, may be NULL
         * \return the number of matching devices (not limited to
         *         \p num_entries)
         */
        cl_uint getDevices(cl_device_type device_type,
                           cl_uint num_entries, cl_device_id * devices);

        /**
         * \brief Get information about the platform
         * \param param_name property to query (CL_PLATFORM_*)
         * \param param_value_size size in bytes of \p param_value
         * \param param_value buffer receiving the value, may be NULL
         * \param param_value_size_ret receives the size of the value,
         *        may be NULL
         * \return CL_SUCCESS, or CL_INVALID_VALUE for an unknown property or
         *         a buffer that is too small
         */
        cl_int info(cl_platform_info param_name,
                    size_t param_value_size,
                    void *param_value,
                    size_t *param_value_size_ret) const;

    private:
        // ICD dispatch table pointer, initialised to &dispatch_table by the
        // constructor. NOTE(review): presumably this must stay the first data
        // member so that an ICD loader finds it at offset 0 - TODO confirm.
        KHRicdVendorDispatch *dispatch;
        std::vector <cl_device_id> p_devices; // devices created by the constructor, deleted by the destructor
        int p_lock_fd;                        // descriptor of the locked lock file
};

}

// Concrete definition of the opaque OpenCL handle type: a cl_platform_id
// points at a Coal::Platform.
struct _cl_platform_id : public Coal::Platform
{};

// The one platform instance, defined in platform.cpp.
extern _cl_platform_id the_platform;
#endif
diff --git a/src/core/program.cpp b/src/core/program.cpp new file mode 100644 index 0000000..5f6e99f --- /dev/null +++ b/src/core/program.cpp @@ -0,0 +1,846 @@
/*
 * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
 * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the copyright holder nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file core/program.cpp + * \brief Program + */ + +#include "program.h" +#include "context.h" +#include "compiler.h" +#include "kernel.h" +#include "propertylist.h" +#include "deviceinterface.h" + +#include <string> +#include <cstring> +#include <cstdlib> +#include <iostream> +#include <vector> +#include <set> +#include <algorithm> + +#include <llvm/ADT/StringRef.h> +#include <llvm/ADT/SmallVector.h> +#include <llvm/Support/MemoryBuffer.h> +#include <llvm/Support/raw_ostream.h> +#include <llvm/Support/Casting.h> +#include <llvm/Support/ErrorOr.h> +#include <llvm/Bitcode/ReaderWriter.h> +#include <llvm/Transforms/IPO.h> +#include <llvm/IR/LLVMContext.h> +#include <llvm/IR/Module.h> +#include <llvm/Linker/Linker.h> +#include <llvm/PassManager.h> +#include <llvm/IR/Metadata.h> +#include <llvm/IR/Function.h> +#include <llvm/Analysis/Passes.h> +#include <llvm/Transforms/IPO.h> +#include <llvm/IR/Instructions.h> +#include <llvm/IR/InstIterator.h> + +#include <runtime/stdlib.c.bc.embed.h> + + +/*----------------------------------------------------------------------------- +* temporary for source file cacheing, remove from product releases 
+*----------------------------------------------------------------------------*/ +//#include "dsp/source_cache.h" +//source_cache * source_cache::pInstance = 0; + +using namespace Coal; +using namespace llvm; + +Program::Program(Context *ctx) +: Object(Object::T_Program, ctx), p_type(Invalid), p_state(Empty) +{ + p_null_device_dependent.compiler = 0; + p_null_device_dependent.device = 0; + p_null_device_dependent.linked_module = 0; + p_null_device_dependent.program = 0; +} + +Program::~Program() +{ + resetDeviceDependent(); +} + +void Program::resetDeviceDependent() +{ + while (p_device_dependent.size()) + { + DeviceDependent &dep = p_device_dependent.back(); + + delete dep.compiler; + delete dep.program; + delete dep.linked_module; + + p_device_dependent.pop_back(); + } +} + +void Program::setDevices(cl_uint num_devices, DeviceInterface * const*devices) +{ + p_device_dependent.resize(num_devices); + + for (cl_uint i=0; i<num_devices; ++i) + { + DeviceDependent &dep = p_device_dependent[i]; + + dep.device = devices[i]; + dep.program = dep.device->createDeviceProgram(this); + dep.is_native_binary = false; + dep.linked_module = 0; + dep.compiler = new Compiler(dep.device); + } +} + +Program::DeviceDependent &Program::deviceDependent(DeviceInterface *device) +{ + for (size_t i=0; i<p_device_dependent.size(); ++i) + { + DeviceDependent &rs = p_device_dependent[i]; + + if (rs.device == device || (!device && p_device_dependent.size() == 1)) + return rs; + } + + return p_null_device_dependent; +} + +const Program::DeviceDependent &Program::deviceDependent(DeviceInterface *device) const +{ + for (size_t i=0; i<p_device_dependent.size(); ++i) + { + const DeviceDependent &rs = p_device_dependent[i]; + + if (rs.device == device || (!device && p_device_dependent.size() == 1)) + return rs; + } + + return p_null_device_dependent; +} + +DeviceProgram *Program::deviceDependentProgram(DeviceInterface *device) const +{ + const DeviceDependent &dep = deviceDependent(device); + + 
return dep.program; +} + +std::string Program::deviceDependentCompilerOptions(DeviceInterface *device) const +{ + const DeviceDependent &dep = deviceDependent(device); + + return dep.compiler->options(); +} + +std::vector<llvm::Function *> Program::kernelFunctions(DeviceDependent &dep) +{ + std::vector<llvm::Function *> rs; + + llvm::NamedMDNode *kernels = + dep.linked_module->getNamedMetadata("opencl.kernels"); + + if (!kernels) return rs; + + for (unsigned int i=0; i<kernels->getNumOperands(); ++i) + { + llvm::MDNode *node = kernels->getOperand(i); + + /*--------------------------------------------------------------------- + * Each node has only one operand : a llvm::Function + *--------------------------------------------------------------------*/ + llvm::Value *value = node->getOperand(0); + + /*--------------------------------------------------------------------- + * Bug somewhere, don't crash + *--------------------------------------------------------------------*/ + if (!llvm::isa<llvm::Function>(value)) continue; + + llvm::Function *f = llvm::cast<llvm::Function>(value); + rs.push_back(f); + } + + return rs; +} + +/****************************************************************************** +* Kernel *Program::createKernel(const std::string &name, cl_int *errcode_ret) +******************************************************************************/ +Kernel *Program::createKernel(const std::string &name, cl_int *errcode_ret) +{ + Kernel *rs = NULL; + + for (size_t i=0; i < kernelList.size(); i++) + { + if (kernelList[i]->p_name.compare(name) == 0) + { + *errcode_ret = CL_SUCCESS; + return kernelList[i]; + } + } + /* Now check the previously released list */ + for (size_t i=0; i < kernelReleasedList.size(); i++) + { + if (kernelReleasedList[i]->p_name.compare(name) == 0) + { + *errcode_ret = CL_SUCCESS; + rs = kernelReleasedList[i]; + kernelReleasedList.erase(kernelReleasedList.begin() + i); + kernelList.push_back(rs); + + return rs; + } + } + + rs = new 
Kernel(this); + + /*------------------------------------------------------------------------- + * Add a function definition for each device + *------------------------------------------------------------------------*/ + for (size_t i=0; i < p_device_dependent.size(); ++i) + { + bool found = false; + DeviceDependent &dep = p_device_dependent[i]; + const std::vector<llvm::Function *> &kernels = kernelFunctions(dep); + + /*--------------------------------------------------------------------- + * Find the one with the good name + *--------------------------------------------------------------------*/ + for (size_t j=0; j < kernels.size(); ++j) + { + llvm::Function *func = kernels[j]; + + if (func->getName().str().compare(name) == 0) + { + found = true; + *errcode_ret = rs->addFunction(dep.device, func, + dep.linked_module); + if (*errcode_ret != CL_SUCCESS) return rs; + break; + } + } + + /*--------------------------------------------------------------------- + * Kernel unavailable for this device + *--------------------------------------------------------------------*/ + if (!found) + { + *errcode_ret = CL_INVALID_KERNEL_NAME; + return rs; + } + else + { + kernelList.push_back(rs); + } + } + + return rs; +} + +Kernel * Program::createKernelsAndReturnKernel(const std::string &name, cl_int *errcode_ret) +{ + Kernel *rs = NULL; + /*------------------------------------------------------------------------- + * We should never go here + *------------------------------------------------------------------------*/ + if (p_device_dependent.size() == 0) return rs; + + + for (size_t i=0; i < kernelList.size(); i++) + { + if (kernelList[i]->p_name.compare(name) == 0) + { + *errcode_ret = CL_SUCCESS; + return kernelList[i]; + } + } + /* Now check the previously released list */ + for (size_t i=0; i < kernelReleasedList.size(); i++) + { + if (kernelReleasedList[i]->p_name.compare(name) == 0) + { + *errcode_ret = CL_SUCCESS; + rs = kernelReleasedList[i]; + 
kernelReleasedList.erase(kernelReleasedList.begin() + i); + kernelList.push_back(rs); + + return rs; + } + } + + /*------------------------------------------------------------------------- + * Take the list of kernels for the first device dependent + *------------------------------------------------------------------------*/ + DeviceDependent &dep = p_device_dependent[0]; + const std::vector<llvm::Function *> &kernels = kernelFunctions(dep); + + /*------------------------------------------------------------------------- + * Create the kernel for each function name + * It returns an error if the signature is not the same for every device + * or if the kernel isn't found on all the devices. + *------------------------------------------------------------------------*/ + *errcode_ret = CL_SUCCESS; + + for (size_t i=0; i < kernels.size(); ++i) + { + cl_int result = CL_SUCCESS; + Kernel *kernel = createKernel(kernels[i]->getName().str(), &result); + + if (result == CL_SUCCESS) + { + } + else + { + *errcode_ret = result; + delete kernel; + } + if (kernel->p_name.compare(name) == 0 && result == CL_SUCCESS) + { + rs = kernel; + *errcode_ret = result; + } + } + + if (!rs && (*errcode_ret == CL_SUCCESS)) + *errcode_ret = CL_INVALID_KERNEL_NAME; + + return rs; +} + +std::vector<Kernel *> Program::createKernels(cl_int *errcode_ret) +{ + std::vector<Kernel *> rs; + Kernel *kern = NULL; + + /*------------------------------------------------------------------------- + * We should never go here + *------------------------------------------------------------------------*/ + if (p_device_dependent.size() == 0) return rs; + + /* + * Resurrect any released kernels back to the kernel list. This handles the + * case where clCreateKernelsInProgram() is asking only for a count of kernels in + * the currently built program. In that case, KernelList.size() must be the actual + * number of kernels compiled into the program (event if they were previously released). 
+ */ + for (size_t i=0; i < kernelReleasedList.size(); i++) + { + kern = kernelReleasedList[i]; + kernelReleasedList.erase(kernelReleasedList.begin() + i); + kernelList.push_back(kern); + } + + if (kernelList.size()) return kernelList; + + /*------------------------------------------------------------------------- + * Take the list of kernels for the first device dependent + *------------------------------------------------------------------------*/ + DeviceDependent &dep = p_device_dependent[0]; + const std::vector<llvm::Function *> &kernels = kernelFunctions(dep); + + /*------------------------------------------------------------------------- + * Create the kernel for each function name + * It returns an error if the signature is not the same for every device + * or if the kernel isn't found on all the devices. + *------------------------------------------------------------------------*/ + for (size_t i=0; i < kernels.size(); ++i) + { + cl_int result = CL_SUCCESS; + Kernel *kernel = createKernel(kernels[i]->getName().str(), &result); + + if (result == CL_SUCCESS) + { + kernelList.push_back(kernel); + } + else + { + *errcode_ret = result; + delete kernel; + } + } + + return kernelList; +} + +cl_int Program::loadSources(cl_uint count, const char **strings, + const size_t *lengths) +{ + // Initialize + p_source = std::string(""); + + // Merge all strings into one big one + for (cl_uint i=0; i<count; ++i) + { + size_t len = 0; + const char *data = strings[i]; + + if (!data) + return CL_INVALID_VALUE; + + // Get the length of the source + if (lengths && lengths[i]) + len = lengths[i]; + else + len = std::strlen(data); + + // Remove trailing \0's, it's not good for sources (it can arise when + // the client application wrongly sets lengths + while (len > 0 && data[len-1] == 0) + len--; + + // Merge the string + std::string part(data, len); + p_source += part; + } + + /*------------------------------------------------------------------------- + * temporary for source 
file cacheing, remove from product releases + *------------------------------------------------------------------------*/ + //source_cache::instance()->remember(p_source); + + p_type = Source; + p_state = Loaded; + + return CL_SUCCESS; +} + +cl_int Program::loadBinaries(const unsigned char **data, const size_t *lengths, + cl_int *binary_status, cl_uint num_devices, + DeviceInterface * const*device_list) +{ + // Set device infos + setDevices(num_devices, device_list); + + // Load the data + for (cl_uint i=0; i<num_devices; ++i) + { + DeviceDependent &dep = deviceDependent(device_list[i]); + dep.unlinked_binary = std::string((const char *)data[i], lengths[i]); + dep.is_native_binary = true; + + /*-------------------------------------------------------------------- + * Loaded binary is either native code with LLVM bitcode embedded, + * or LLVM bitcode itself + *--------------------------------------------------------------------*/ + std::string bitcode; + if (! dep.program->ExtractMixedBinary(&dep.unlinked_binary, &bitcode, + NULL)) + { + bitcode = dep.unlinked_binary; + dep.is_native_binary = false; + } + + const llvm::StringRef s_data(bitcode); + const llvm::StringRef s_name("<binary>"); + + llvm::MemoryBuffer *buffer = llvm::MemoryBuffer::getMemBuffer( + s_data, s_name, false); + + if (!buffer) + return CL_OUT_OF_HOST_MEMORY; + + // Make a module of it + ErrorOr<Module *> ModuleOrErr = parseBitcodeFile(buffer, + llvm::getGlobalContext()); + if (ModuleOrErr) { + dep.linked_module = ModuleOrErr.get(); + } + else { + dep.linked_module = NULL; + if (binary_status) binary_status[i] = CL_INVALID_VALUE; + return CL_INVALID_BINARY; + } + + if (binary_status) binary_status[i] = CL_SUCCESS; + } + + p_type = Binary; + p_state = Loaded; + + return CL_SUCCESS; +} + +cl_int Program::build(const char *options, + void (CL_CALLBACK *pfn_notify)(cl_program program, + void *user_data), + void *user_data, cl_uint num_devices, + DeviceInterface * const*device_list) +{ + // If we've 
already built this program and are re-building + // (for example, with different user options) then clear out the + // device dependent information in preparation for building again. + if( p_state == Built) resetDeviceDependent(); + + p_state = Failed; + + // Set device infos + if (!p_device_dependent.size()) + { + setDevices(num_devices, device_list); + } + + // ASW TODO - optimize to compile for each device type only once. + for (cl_uint i=0; i<p_device_dependent.size(); ++i) + { + DeviceDependent &dep = deviceDependent(device_list[i]); + + // Do we need to compile the source for each device ? + if (p_type == Source) + { + // Load source + const llvm::StringRef s_data(p_source); + const llvm::StringRef s_name("<source>"); + + llvm::MemoryBuffer *buffer = llvm::MemoryBuffer::getMemBuffer( + s_data, s_name); + + // Compile + int compile_result = dep.compiler->compile(options ? options : std::string(), buffer); + if (compile_result) + //if (! dep.compiler->compile(options ? options : std::string(), + // buffer) ) + { + if (pfn_notify) + pfn_notify((cl_program)this, user_data); + if (compile_result == CL_INVALID_BUILD_OPTIONS) + return CL_INVALID_BUILD_OPTIONS; + else + return CL_BUILD_PROGRAM_FAILURE; + } + + // Get module and its bitcode + dep.linked_module = dep.compiler->module(); + + llvm::raw_string_ostream ostream(dep.unlinked_binary); + llvm::WriteBitcodeToFile(dep.linked_module, ostream); + ostream.flush(); + } + + // Link p_linked_module with the stdlib if the device needs that + if (! 
dep.is_native_binary && dep.program->linkStdLib()) + { + // Load the stdlib bitcode + const llvm::StringRef s_data(embed_stdlib_c_bc, + sizeof(embed_stdlib_c_bc) - 1); + const llvm::StringRef s_name("stdlib.bc"); + std::string errMsg; + + llvm::MemoryBuffer *buffer = llvm::MemoryBuffer::getMemBuffer( + s_data, s_name, false); + + if (!buffer) + return CL_OUT_OF_HOST_MEMORY; + + ErrorOr<Module *> ModuleOrErr = + parseBitcodeFile(buffer, llvm::getGlobalContext()); + Module *stdlib = NULL; + if (ModuleOrErr) { + stdlib = ModuleOrErr.get(); + } + else { + std::error_code EC = ModuleOrErr.getError(); + errMsg = EC.message(); + } + + // Link + if (!stdlib || + llvm::Linker::LinkModules(dep.linked_module, stdlib, + llvm::Linker::DestroySource, &errMsg)) + { + dep.compiler->appendLog("link error: "); + dep.compiler->appendLog(errMsg); + dep.compiler->appendLog("\n"); + + // DEBUG + std::cout << dep.compiler->log() << std::endl; + + if (pfn_notify) + pfn_notify((cl_program)this, user_data); + + return CL_BUILD_PROGRAM_FAILURE; + } + } + + if (! 
dep.is_native_binary) + { + // Get list of kernels to strip other unused functions + std::vector<const char *> api; + std::vector<std::string> api_s; // Needed to keep valid data in api + const std::vector<llvm::Function *> &kernels = kernelFunctions(dep); + + for (size_t j=0; j<kernels.size(); ++j) + { + std::string s = kernels[j]->getName().str(); + api_s.push_back(s); + api.push_back(s.c_str()); + } + + // determine if module has barrier() function calls + bool hasBarrier = false; + llvm::CallInst* call; + for (llvm::Module::iterator F = dep.linked_module->begin(), + EF = dep.linked_module->end(); !hasBarrier && F != EF; ++F) + for (llvm::inst_iterator I = inst_begin(*F), + E = inst_end(*F); I != E; ++I) + { + if (!(call = llvm::dyn_cast<llvm::CallInst>(&*I))) continue; + if (!call->getCalledFunction()) continue; + std::string name(call->getCalledFunction()->getName()); + if (name == "barrier") + { + hasBarrier = true; + break; + } + } + + // Optimize code + llvm::PassManager *manager = new llvm::PassManager(); + + // Common passes (primary goal : remove unused stdlib functions) + manager->add(llvm::createTypeBasedAliasAnalysisPass()); + manager->add(llvm::createBasicAliasAnalysisPass()); + manager->add(llvm::createInternalizePass(api)); + manager->add(llvm::createIPSCCPPass()); + manager->add(llvm::createGlobalOptimizerPass()); + manager->add(llvm::createConstantMergePass()); + manager->add(llvm::createAlwaysInlinerPass()); + + dep.program->createOptimizationPasses(manager, + dep.compiler->optimize(), hasBarrier); + + manager->add(llvm::createGlobalDCEPass()); + + manager->run(*dep.linked_module); + delete manager; + } + + // Now that the LLVM module is built, build the device-specific + // representation + if (!dep.program->build(dep.linked_module, &dep.unlinked_binary)) + { + if (pfn_notify) + pfn_notify((cl_program)this, user_data); + + return CL_BUILD_PROGRAM_FAILURE; + } + } + + // TODO: Asynchronous compile + if (pfn_notify) + pfn_notify((cl_program)this, 
user_data); + + p_state = Built; + + return CL_SUCCESS; +} + +Program::Type Program::type() const +{ + return p_type; +} + +Program::State Program::state() const +{ + return p_state; +} + +cl_int Program::info(cl_program_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) const +{ + void *value = 0; + size_t value_length = 0; + llvm::SmallVector<size_t, 4> binary_sizes; + llvm::SmallVector<DeviceInterface *, 4> devices; + + union { + cl_uint cl_uint_var; + cl_context cl_context_var; + }; + + switch (param_name) + { + case CL_PROGRAM_REFERENCE_COUNT: + SIMPLE_ASSIGN(cl_uint, references()); + break; + + case CL_PROGRAM_NUM_DEVICES: + // Use devices associated with any built kernels, otherwise use + // the devices associated with the program context + if (p_device_dependent.size() != 0) + { SIMPLE_ASSIGN(cl_uint, p_device_dependent.size()); } + else + return ((Context *)parent())->info(CL_CONTEXT_NUM_DEVICES, + param_value_size, param_value, param_value_size_ret); + break; + + case CL_PROGRAM_DEVICES: + // Use devices associated with any built kernels, otherwise use + // the devices associated with the program context + if (p_device_dependent.size() != 0) + { + for (size_t i=0; i<p_device_dependent.size(); ++i) + { + const DeviceDependent &dep = p_device_dependent[i]; + + devices.push_back(dep.device); + } + + value = devices.data(); + value_length = devices.size() * sizeof(DeviceInterface *); + } + else + return ((Context *)parent())->info(CL_CONTEXT_DEVICES, + param_value_size, param_value, param_value_size_ret); + break; + + case CL_PROGRAM_CONTEXT: + SIMPLE_ASSIGN(cl_context, parent()); + break; + + case CL_PROGRAM_SOURCE: + MEM_ASSIGN(p_source.size() + 1, p_source.c_str()); + break; + + case CL_PROGRAM_BINARY_SIZES: + for (size_t i=0; i<p_device_dependent.size(); ++i) + { + const DeviceDependent &dep = p_device_dependent[i]; + + binary_sizes.push_back(dep.unlinked_binary.size()); + } + + value = binary_sizes.data(); + 
value_length = binary_sizes.size() * sizeof(size_t); + break; + + case CL_PROGRAM_BINARIES: + { + // Special case : param_value points to an array of p_num_devices + // application-allocated unsigned char* pointers. Check it's good + // and std::memcpy the data. A null entry in that array is skipped. + + unsigned char **binaries = (unsigned char **)param_value; + value_length = p_device_dependent.size() * sizeof(unsigned char *); + + // A non-null param_value that is too small to hold one pointer per + // device is rejected, like the common path after the switch does + if (param_value && param_value_size < value_length) + return CL_INVALID_VALUE; + + if (param_value) + for (size_t i=0; i<p_device_dependent.size(); ++i) + { + const DeviceDependent &dep = p_device_dependent[i]; + unsigned char *dest = binaries[i]; + + if (!dest) + continue; + + std::memcpy(dest, dep.unlinked_binary.data(), + dep.unlinked_binary.size()); + } + + if (param_value_size_ret) + *param_value_size_ret = value_length; + + return CL_SUCCESS; + } + + default: + return CL_INVALID_VALUE; + } + if (param_value && param_value_size < value_length) + return CL_INVALID_VALUE; + + if (param_value_size_ret) + *param_value_size_ret = value_length; + + if (param_value) + std::memcpy(param_value, value, value_length); + + return CL_SUCCESS; +} + +cl_int Program::buildInfo(DeviceInterface *device, + cl_program_build_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) const +{ + const void *value = 0; + size_t value_length = 0; + const DeviceDependent &dep = deviceDependent(device); + + union { + cl_build_status cl_build_status_var; + }; + + switch (param_name) + { + case CL_PROGRAM_BUILD_STATUS: + switch (p_state) + { + case Empty: + case Loaded: + SIMPLE_ASSIGN(cl_build_status, CL_BUILD_NONE); + break; + case Built: + SIMPLE_ASSIGN(cl_build_status, CL_BUILD_SUCCESS); + break; + case Failed: + SIMPLE_ASSIGN(cl_build_status, CL_BUILD_ERROR); + break; + // TODO: CL_BUILD_IN_PROGRESS + } + break; + + case CL_PROGRAM_BUILD_OPTIONS: + value = dep.compiler->options().c_str(); + value_length = dep.compiler->options().size() + 1; + break; + + case CL_PROGRAM_BUILD_LOG: + value = 
dep.compiler->log().c_str(); + value_length = dep.compiler->log().size() + 1; + break; + + default: + return CL_INVALID_VALUE; + } + + if (param_value && param_value_size < value_length) + return CL_INVALID_VALUE; + + if (param_value_size_ret) + *param_value_size_ret = value_length; + + if (param_value) + std::memcpy(param_value, value, value_length); + + return CL_SUCCESS; +} diff --git a/src/core/program.h b/src/core/program.h new file mode 100644 index 0000000..a06b452 --- /dev/null +++ b/src/core/program.h @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file core/program.h + * \brief Program + */ + +#ifndef __PROGRAM_H__ +#define __PROGRAM_H__ + +#include "object.h" + +#include <CL/cl.h> +#include <string> +#include <vector> + +namespace llvm +{ + class MemoryBuffer; + class Module; + class Function; +} + +namespace Coal +{ + +class Context; +class Compiler; +class DeviceInterface; +class DeviceProgram; +class Kernel; + +/** + * \brief Program object + * + * This class compiles and links a source or binaries into LLVM modules for each + * \c Coal::DeviceInterface for which the program is built. + * + * It then contains functions to get the list of kernels available in the + * program, using \c Coal::Kernel objects. 
+ */ +class Program : public Object +{ + public: + /** + * \brief Constructor + * \param ctx parent \c Coal::Context + */ + Program(Context *ctx); + ~Program(); + + /** + * \brief Program type + */ + enum Type + { + Invalid, /*!< Invalid or unknown, type of a program not already loaded */ + Source, /*!< Program made of sources that must be compiled and linked */ + Binary /*!< Program made of pre-built binaries that only need to be (transformed)/linked */ + }; + + /** + * \brief Program state + */ + enum State + { + Empty, /*!< Just created */ + Loaded, /*!< Source or binary loaded */ + Built, /*!< Built */ + Failed, /*!< Build failed */ + }; + + /** + * \brief Load sources into the program + * + * This function loads the source-code given in \p strings into the + * program and sets its type to \c Source. + * + * \param count number of strings in \p strings + * \param strings array of pointers to strings, either null-terminated + * or of length given in \p lengths + * \param lengths lengths of the strings. If a field is 0, the + * corresponding string is null-terminated. If \p lengths is + * 0, all the strings are null-terminated + * \return \c CL_SUCCESS if success, an error code otherwise + */ + cl_int loadSources(cl_uint count, const char **strings, + const size_t *lengths); + + /** + * \brief Load binaries into the program + * + * This function allows client application to load a source, retrieve + * binaries using \c buildInfo(), and then re-create the same program + * (after a restart for example) by giving it a precompiled binary. + * + * This function loads the binaries for each device and parse them into + * LLVM modules, then sets the program type to \c Binary or + * \c NativeBinary. 
+ * + * \param data array of pointers to binaries, one for each device + * \param lengths lengths of the binaries pointed to by \p data + * \param binary_status array that will be filled by this function with + * the status of each loaded binary (\c CL_SUCCESS if success) + * \param num_devices number of devices for which a binary is loaded + * \param device_list list of devices for which the binaries are loaded + * \return \c CL_SUCCESS if success, an error code otherwise + */ + cl_int loadBinaries(const unsigned char **data, const size_t *lengths, + cl_int *binary_status, cl_uint num_devices, + DeviceInterface * const*device_list); + + /** + * \brief Build the program + * + * This function compiles the sources, if any, and then link the + * resulting binaries if the devices for which they are compiled asks + * \c Coal::Program to do so, using \c Coal::DeviceProgram::linkStdLib(). + * + * \param options options to pass to the compiler, see the OpenCL + * specification. + * \param pfn_notify callback function called at the end of the build + * \param user_data user data given to \p pfn_notify + * \param num_devices number of devices for which binaries are being + * built. If it's a source-based program, this can be 0. + * \param device_list list of devices for which the program will be built. 
+ * \return \c CL_SUCCESS if success, an error code otherwise + */ + cl_int build(const char *options, + void (CL_CALLBACK *pfn_notify)(cl_program program, + void *user_data), + void *user_data, cl_uint num_devices, + DeviceInterface * const*device_list); + + Type type() const; /*!< \brief Type of the program */ + State state() const; /*!< \brief State of the program */ + + /** + * \brief Create a kernel given a \p name + * \param name name of the kernel to be created + * \param errcode_ret return code (\c CL_SUCCESS if success) + * \return a \c Coal::Kernel object corresponding to the given \p name + */ + Kernel *createKernel(const std::string &name, cl_int *errcode_ret); + + /** + * \brief Create kernels of the program and return given a \p name + * \param name name of the kernel to be returned + * \param errcode_ret return code (\c CL_SUCCESS if success) + * \return a \c Coal::Kernel object corresponding to the given \p name + */ + Kernel *createKernelsAndReturnKernel(const std::string &name, cl_int *errcode_ret); + + /** + * \brief Create all the kernels of the program + * \param errcode_ret return code (\c CL_SUCCESS if success) + * \return the list of \c Coal::Kernel objects of this program + */ + std::vector<Kernel *> createKernels(cl_int *errcode_ret); + + /** + * \brief Device-specific program + * \param device device for which the device-specific program is needed + * \return the device-specific program requested, 0 if not found + */ + DeviceProgram *deviceDependentProgram(DeviceInterface *device) const; + std::string deviceDependentCompilerOptions(DeviceInterface *device) const; + + /** + * \brief Get information about this program + * \copydetails Coal::DeviceInterface::info + */ + cl_int info(cl_program_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) const; + + /** + * \brief Get build info about this program (log, binaries, etc) + * \copydetails Coal::DeviceInterface::info + * \param device \c 
Coal::DeviceInterface for which info is needed + */ + cl_int buildInfo(DeviceInterface *device, + cl_program_build_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) const; + + std::string source() { return p_source; } + + std::vector<Kernel *> kernelList; + std::vector<Kernel *> kernelReleasedList; + + private: + Type p_type; + State p_state; + std::string p_source; + + struct DeviceDependent + { + DeviceInterface * device; + DeviceProgram * program; + std::string unlinked_binary; + bool is_native_binary; // llvm kernel bitcode vs final native binary + llvm::Module * linked_module; + Compiler * compiler; + }; + + std::vector<DeviceDependent> p_device_dependent; + DeviceDependent p_null_device_dependent; + + void setDevices(cl_uint num_devices, DeviceInterface * const*devices); + void resetDeviceDependent(); + DeviceDependent &deviceDependent(DeviceInterface *device); + const DeviceDependent &deviceDependent(DeviceInterface *device) const; + std::vector<llvm::Function *> kernelFunctions(DeviceDependent &dep); +}; + +} + +struct _cl_program : public Coal::Program +{}; + +#endif diff --git a/src/core/propertylist.h b/src/core/propertylist.h new file mode 100644 index 0000000..8d32397 --- /dev/null +++ b/src/core/propertylist.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file propertylist.h + * \brief Helper macros for \c info() functions + * + * The OpenCL API is full of functions like \c clGetXXXInfo(). They all take + * the same arguments and are handled the same way. This file contains macros + * easing the implementation of these info functions. 
+ * + * One info function, using these macros, looks like this: + * + * \code + * cl_int Foo::info(cl_foo_info param_name, + * size_t param_value_size, + * void *param_value, + * size_t *param_value_size_ret) const + * { + * void *value = 0; + * size_t value_length = 0; + * + * union { + * cl_uint cl_uint_var; + * cl_context cl_context_var; + * }; + * + * switch (param_name) + * { + * case CL_UINT_PARAM: + * SIMPLE_ASSIGN(cl_uint, the_value); + * break; + * case CL_CONTEXT_PARAM: + * SIMPLE_ASSIGN(cl_context, a_call()); + * break; + * case CL_STRING_PARAM: + * STRING_ASSIGN("This is a string"); + * break; + * case CL_BINARY_PARAM: + * MEM_ASSIGN(sizeof(something), something); + * break; + * default: + * return CL_INVALID_VALUE; + * } + * + * if (param_value && param_value_size < value_length) + * return CL_INVALID_VALUE; + * + * if (param_value_size_ret) + * *param_value_size_ret = value_length; + * + * if (param_value) + * std::memcpy(param_value, value, value_length); + * + * return CL_SUCCESS; + * } + * \endcode + */ + +#ifndef __PROPERTYLIST_H__ +#define __PROPERTYLIST_H__ + +/** + * \brief Assign a value of a given type to the return value + * \note expects \c value, \c value_length and a variable named after the + * type with a \c _var suffix to be in scope, as in the example above + * \note the expansion already ends with a semicolon; callers written as + * <tt>if (c) SIMPLE_ASSIGN(t, a) else SIMPLE_ASSIGN(t, b);</tt> depend + * on that, so it is kept + * \param type type of the argument + * \param _value value to assign; may be any expression + */ +#define SIMPLE_ASSIGN(type, _value) do { \ + value_length = sizeof(type); \ + type##_var = (type)(_value); \ + value = & type##_var; \ +} while (0); + +/** + * \brief Assign a string to the return value + * \note the reported length is <tt>sizeof</tt> of the literal, so it + * includes the terminating null character + * \param string the string to assign, as a constant + */ +#define STRING_ASSIGN(string) do { \ + static const char str[] = string; \ + value_length = sizeof(str); \ + value = (void *)str; \ +} while (0); + +/** + * \brief Assign a memory buffer to the return value + * \note the buffer must remain valid after the end of the \c info() call + * \param size size of the buffer; may be any expression + * \param buf buffer (of type <tt>void *</tt> for instance) + */ +#define MEM_ASSIGN(size, buf) do { \ + value_length = (size); \ + value = (void *)(buf); \ +} while (0); + +#endif 
diff --git a/src/core/sampler.cpp b/src/core/sampler.cpp new file mode 100644 index 0000000..71fca86 --- /dev/null +++ b/src/core/sampler.cpp @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** + * \file core/sampler.cpp + * \brief Sampler + */ + +#include "sampler.h" +#include "context.h" +#include "deviceinterface.h" +#include "propertylist.h" + +#include <cstring> +#include <cstdlib> + +using namespace Coal; + +Sampler::Sampler(Context *ctx, + cl_bool normalized_coords, + cl_addressing_mode addressing_mode, + cl_filter_mode filter_mode, + cl_int *errcode_ret) +: Object(Object::T_Sampler, ctx), p_bitfield(0) +{ + if (normalized_coords) + p_bitfield |= CLK_NORMALIZED_COORDS_TRUE; + else + p_bitfield |= CLK_NORMALIZED_COORDS_FALSE; + + switch (addressing_mode) + { + case CL_ADDRESS_NONE: + p_bitfield |= CLK_ADDRESS_NONE; + break; + + case CL_ADDRESS_MIRRORED_REPEAT: + p_bitfield |= CLK_ADDRESS_MIRRORED_REPEAT; + break; + + case CL_ADDRESS_REPEAT: + p_bitfield |= CLK_ADDRESS_REPEAT; + break; + + case CL_ADDRESS_CLAMP_TO_EDGE: + p_bitfield |= CLK_ADDRESS_CLAMP_TO_EDGE; + break; + + case CL_ADDRESS_CLAMP: + p_bitfield |= CLK_ADDRESS_CLAMP; + break; + + default: + *errcode_ret = CL_INVALID_VALUE; + return; + } + + switch (filter_mode) + { + case CL_FILTER_NEAREST: + p_bitfield |= CLK_FILTER_NEAREST; + break; + + case CL_FILTER_LINEAR: + p_bitfield |= CLK_FILTER_LINEAR; + break; + + default: + *errcode_ret = CL_INVALID_VALUE; + return; + } + + // Check that images are available on all the devices + *errcode_ret = checkImageAvailability(); +} + +Sampler::Sampler(Context *ctx, unsigned int bitfield) +: Object(Object::T_Sampler, ctx), p_bitfield(bitfield) +{ + checkImageAvailability(); +} + +cl_int Sampler::checkImageAvailability() const +{ + cl_uint num_devices; + DeviceInterface **devices; + cl_int rs; + + rs = ((Context *)parent())->info(CL_CONTEXT_NUM_DEVICES, + sizeof(unsigned int), + &num_devices, 0); + + if (rs != CL_SUCCESS) + return rs; + + devices = (DeviceInterface **)std::malloc(num_devices * + sizeof(DeviceInterface *)); + + if (!devices) + { + return CL_OUT_OF_HOST_MEMORY; + } + + rs = ((Context *)parent())->info(CL_CONTEXT_DEVICES, + 
num_devices * sizeof(DeviceInterface *), + devices, 0); + + if (rs != CL_SUCCESS) + { + std::free((void *)devices); + return rs; + } + + for (unsigned int i=0; i<num_devices; ++i) + { + cl_bool image_support; + + rs = devices[i]->info(CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), + &image_support, 0); + + if (rs != CL_SUCCESS) + { + std::free((void *)devices); + return rs; + } + + if (!image_support) + { + std::free((void *)devices); + return CL_INVALID_OPERATION; + } + } + + std::free((void *)devices); + + return CL_SUCCESS; +} + +unsigned int Sampler::bitfield() const +{ + return p_bitfield; +} + +cl_int Sampler::info(cl_sampler_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) const +{ + void *value = 0; + size_t value_length = 0; + + union { + cl_uint cl_uint_var; + cl_context cl_context_var; + cl_bool cl_bool_var; + cl_addressing_mode cl_addressing_mode_var; + cl_filter_mode cl_filter_mode_var; + }; + + switch (param_name) + { + case CL_SAMPLER_REFERENCE_COUNT: + SIMPLE_ASSIGN(cl_uint, references()); + break; + + case CL_SAMPLER_CONTEXT: + SIMPLE_ASSIGN(cl_context, parent()); + break; + + case CL_SAMPLER_NORMALIZED_COORDS: + if (p_bitfield & CLK_NORMALIZED_COORDS_MASK) + SIMPLE_ASSIGN(cl_bool, true) + else + SIMPLE_ASSIGN(cl_bool, false); + break; + + case CL_SAMPLER_ADDRESSING_MODE: + switch (p_bitfield & CLK_ADDRESS_MODE_MASK) + { + case CLK_ADDRESS_CLAMP: + SIMPLE_ASSIGN(cl_addressing_mode, CL_ADDRESS_CLAMP); + break; + case CLK_ADDRESS_CLAMP_TO_EDGE: + SIMPLE_ASSIGN(cl_addressing_mode, CL_ADDRESS_CLAMP_TO_EDGE); + break; + case CLK_ADDRESS_MIRRORED_REPEAT: + SIMPLE_ASSIGN(cl_addressing_mode, CL_ADDRESS_MIRRORED_REPEAT); + break; + case CLK_ADDRESS_REPEAT: + SIMPLE_ASSIGN(cl_addressing_mode, CL_ADDRESS_REPEAT); + break; + case CLK_ADDRESS_NONE: + SIMPLE_ASSIGN(cl_addressing_mode, CL_ADDRESS_NONE); + break; + } + break; + + case CL_SAMPLER_FILTER_MODE: + switch (p_bitfield & CLK_FILTER_MASK) + { + case 
CLK_FILTER_LINEAR: + SIMPLE_ASSIGN(cl_filter_mode, CL_FILTER_LINEAR); + break; + case CLK_FILTER_NEAREST: + SIMPLE_ASSIGN(cl_filter_mode, CL_FILTER_NEAREST); + break; + } + + default: + return CL_INVALID_VALUE; + } + + if (param_value && param_value_size < value_length) + return CL_INVALID_VALUE; + + if (param_value_size_ret) + *param_value_size_ret = value_length; + + if (param_value) + std::memcpy(param_value, value, value_length); + + return CL_SUCCESS; +} diff --git a/src/core/sampler.h b/src/core/sampler.h new file mode 100644 index 0000000..1ff1f1f --- /dev/null +++ b/src/core/sampler.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file sampler.h + * \brief Sampler object + */ + +#ifndef __SAMPLER_H__ +#define __SAMPLER_H__ + +#include <CL/cl.h> +#include "object.h" + +// WARNING: Keep in sync with stdlib.h + +#define CLK_NORMALIZED_COORDS_FALSE 0x00000000 +#define CLK_NORMALIZED_COORDS_TRUE 0x00000001 +#define CLK_ADDRESS_NONE 0x00000000 +#define CLK_ADDRESS_MIRRORED_REPEAT 0x00000010 +#define CLK_ADDRESS_REPEAT 0x00000020 +#define CLK_ADDRESS_CLAMP_TO_EDGE 0x00000030 +#define CLK_ADDRESS_CLAMP 0x00000040 +#define CLK_FILTER_NEAREST 0x00000000 +#define CLK_FILTER_LINEAR 0x00000100 + +#define CLK_NORMALIZED_COORDS_MASK 0x0000000f +#define CLK_ADDRESS_MODE_MASK 0x000000f0 +#define CLK_FILTER_MASK 0x00000f00 + +namespace Coal +{ + +class Context; + +/** + * \brief Sampler + * + * This object doesn't do anything intersting, it only converts a set of + * host OpenCL constants to constants that will be used by the kernels and + * the image reading and writing built-in functions. 
+ */ +class Sampler : public Object +{ + public: + /** + * \brief Constructor + * \param ctx parent \c Coal::Context + * \param normalized_coords true if the coords given to the built-in + * image functions are normalized, false otherwise + * \param addressing_mode addressing mode used to read images + * \param filter_mode filter mode used to read images + * \param errcode_ret return code (\c CL_SUCCESS if all is good) + */ + Sampler(Context *ctx, + cl_bool normalized_coords, + cl_addressing_mode addressing_mode, + cl_filter_mode filter_mode, + cl_int *errcode_ret); + + /** + * \brief Simpler constructor + * \param ctx parent \c Coal::Context + * \param bitfield bitfield already calculated + */ + Sampler(Context *ctx, + unsigned int bitfield); + + unsigned int bitfield() const; /*!< \brief Bitfield value usable by the kernels */ + + /** + * \brief Get information about the sampler + * \copydetails Coal::DeviceInterface::info + */ + cl_int info(cl_sampler_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) const; + + private: + unsigned int p_bitfield; + + cl_int checkImageAvailability() const; +}; + +} + +struct _cl_sampler : public Coal::Sampler +{}; + +#endif diff --git a/src/core/util.cpp b/src/core/util.cpp new file mode 100644 index 0000000..afeb564 --- /dev/null +++ b/src/core/util.cpp @@ -0,0 +1,68 @@ +/****************************************************************************** + * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +/** + * \file core/util.cpp + * \brief misc utils + */ + +#include <stdint.h> +#include <ctype.h> +#include <string.h> +#include <stdio.h> +#include <stdlib.h> + +#include "util.h" + +/****************************************************************************** +* Parse first line in a file, read the unsigned integer following a string +* +* fname: path of the file to read; only its first line is examined +* sname: string searched for in that line +* default_val: returned when an argument is NULL, the file cannot be opened, +* the line cannot be read, sname is not found, or no digit +* follows sname +* +* Non-digit characters between sname and the first digit are skipped. +******************************************************************************/ +uint32_t parse_file_line_value(const char *fname, const char *sname, + uint32_t default_val) +{ + uint32_t val = default_val; + FILE *fp = NULL; + char *line = NULL; + char *str = NULL; + size_t len = 0; + + if (fname == NULL || sname == NULL) return val; + if ((fp = fopen(fname, "r")) == NULL) return val; + if (getline(&line, &len, fp) != -1) + { + if ((str = strstr(line, sname)) != NULL) + { + str += strlen(sname); + // isdigit() needs a value representable as unsigned char + while(!isdigit((unsigned char)*str) && *str != '\0') str++; + // strtoul() covers the whole uint32_t range, unlike atoi() + if (*str != '\0') val = (uint32_t)strtoul(str, NULL, 10); + } + } + + // fp is known to be open here, and free(NULL) is a no-op + fclose(fp); + free(line); + return val; +} + diff --git a/src/core/util.h b/src/core/util.h new file mode 100644 index 0000000..f2c1609 --- /dev/null +++ b/src/core/util.h @@ -0,0 +1,41 @@ +/****************************************************************************** + * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +/** + * \file core/util.h + * \brief misc utils + */ + +#ifndef _UTIL_H +#define _UTIL_H + +// Parse first line in a file, read integer immediately following a string +uint32_t parse_file_line_value(const char *fname, const char *sname, + uint32_t default_val); + +#endif // _UTIL_H + diff --git a/src/llvmopencl/AllocasToEntry.cc b/src/llvmopencl/AllocasToEntry.cc new file mode 100644 index 0000000..79bbe63 --- /dev/null +++ b/src/llvmopencl/AllocasToEntry.cc @@ -0,0 +1,74 @@ +// Header for AllocasToEntry, an LLVM pass to move allocas to the function +// entry node. 
+// +// Copyright (c) 2013 Pekka Jääskeläinen / TUT +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "config.h" +#include <sstream> +#include <iostream> + +#ifdef LLVM_3_2 +# include <llvm/Instructions.h> +#else +# include <llvm/IR/Instructions.h> +#endif + +#include "AllocasToEntry.h" + +namespace pocl { + +using namespace llvm; + +namespace { + static + RegisterPass<pocl::AllocasToEntry> X("allocastoentry", + "Move allocas to the function entry node."); +} + +char AllocasToEntry::ID = 0; + + +AllocasToEntry::AllocasToEntry() : FunctionPass(ID) +{ +} + +bool +AllocasToEntry::runOnFunction(Function &F) +{ + // This solves problem with dynamic stack objects that are + // not supported by some targets (TCE). 
+ Function::iterator I = F.begin(); + Instruction *firstInsertionPt = (I++)->getFirstInsertionPt(); + + bool changed = false; + for (Function::iterator E = F.end(); I != E; ++I) { + for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) { + AllocaInst *allocaInst = dyn_cast<AllocaInst>(BI++); + if (allocaInst && isa<ConstantInt>(allocaInst->getArraySize())) { + allocaInst->moveBefore(firstInsertionPt); + changed = true; + } + } + } + return changed; +} + +} diff --git a/src/llvmopencl/AllocasToEntry.h b/src/llvmopencl/AllocasToEntry.h new file mode 100644 index 0000000..a92fa14 --- /dev/null +++ b/src/llvmopencl/AllocasToEntry.h @@ -0,0 +1,49 @@ +// Header for AllocasToEntry, an LLVM pass to move allocas to the function +// entry node. +// +// Copyright (c) 2013 Pekka Jääskeläinen / TUT +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#ifndef _POCL_ALLOCAS_TO_ENTRY_H +#define _POCL_ALLOCAS_TO_ENTRY_H + +#include "config.h" +#if (defined LLVM_3_1 or defined LLVM_3_2) +#include "llvm/Function.h" +#else +#include "llvm/IR/Function.h" +#endif + +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" + +namespace pocl { + class AllocasToEntry : public llvm::FunctionPass { + public: + static char ID; + + AllocasToEntry(); + virtual ~AllocasToEntry() {}; + + virtual bool runOnFunction(llvm::Function &F); + }; +} + +#endif diff --git a/src/llvmopencl/Barrier.h b/src/llvmopencl/Barrier.h new file mode 100644 index 0000000..e1b612f --- /dev/null +++ b/src/llvmopencl/Barrier.h @@ -0,0 +1,121 @@ +// Class for barrier instructions, modelled as a CallInstr. +// +// Copyright (c) 2011 Universidad Rey Juan Carlos +// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#include <cstdio> + +#include "config.h" +#if (defined LLVM_3_1 or defined LLVM_3_2) +#include "llvm/Instructions.h" +#include "llvm/Function.h" +#include "llvm/Module.h" +#else +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#endif + +#include "llvm/Support/Casting.h" + +#define BARRIER_FUNCTION_NAME "barrier" + +namespace pocl { + + class Barrier : public llvm::CallInst { + + public: + static void GetBarriers(llvm::SmallVectorImpl<Barrier *> &B, + llvm::Module &M) { + llvm::Function *F = M.getFunction(BARRIER_FUNCTION_NAME); + if (F != NULL) { + for (llvm::Function::use_iterator i = F->use_begin(), e = F->use_end(); + i != e; ++i) + B.push_back(llvm::cast<Barrier>(*i)); + } + } + /** + * Creates a new barrier before the given instruction. + * + * If there was already a barrier there, returns the old one. + */ + static Barrier *Create(llvm::Instruction *InsertBefore) { + llvm::Module *M = InsertBefore->getParent()->getParent()->getParent(); + + if (InsertBefore != &InsertBefore->getParent()->front() && + llvm::isa<Barrier>(InsertBefore->getPrevNode())) + return llvm::cast<Barrier>(InsertBefore->getPrevNode()); + + llvm::Type *Int32Type = llvm::Type::getInt32Ty(M->getContext()); + llvm::Function *F = llvm::cast<llvm::Function> + (M->getOrInsertFunction(BARRIER_FUNCTION_NAME, + llvm::Type::getVoidTy(M->getContext()), + Int32Type, + NULL)); + llvm::SmallVector<llvm::Value *, 4> argsarray; + argsarray.push_back(llvm::ConstantInt::get(Int32Type, 0)); + llvm::ArrayRef<llvm::Value *> args(argsarray); + return llvm::cast<pocl::Barrier> + (llvm::CallInst::Create(F, args, "", InsertBefore)); + } + static bool classof(const Barrier *) { return true; }; + static bool classof(const llvm::CallInst *C) { + return C->getCalledFunction() != NULL && + C->getCalledFunction()->getName() == BARRIER_FUNCTION_NAME; + } + static bool classof(const Instruction *I) { + return (llvm::isa<llvm::CallInst>(I) && + 
classof(llvm::cast<llvm::CallInst>(I))); + } + static bool classof(const User *U) { + return (llvm::isa<Instruction>(U) && + classof(llvm::cast<llvm::Instruction>(U))); + } + + + static bool hasOnlyBarrier(const llvm::BasicBlock *bb) + { + return endsWithBarrier(bb) && bb->size() == 2; + } + + static bool hasBarrier(const llvm::BasicBlock *bb) + { + for (llvm::BasicBlock::const_iterator i = bb->begin(), e = bb->end(); + i != e; ++i) + { + if (llvm::isa<Barrier>(i)) return true; + } + return false; + } + + // returns true in case the given basic block ends with a barrier, + // that is, contains only a branch instruction after a barrier call + static bool endsWithBarrier(const llvm::BasicBlock *bb) + { + const llvm::TerminatorInst *t = bb->getTerminator(); + if (t == NULL) return false; + return bb->size() > 1 && t->getPrevNode() != NULL && + llvm::isa<Barrier>(t->getPrevNode()); + } + }; + +} + diff --git a/src/llvmopencl/BarrierBlock.cc b/src/llvmopencl/BarrierBlock.cc new file mode 100644 index 0000000..d254fa6 --- /dev/null +++ b/src/llvmopencl/BarrierBlock.cc @@ -0,0 +1,73 @@ +// Class for a basic block that just contains a barrier. +// +// Copyright (c) 2011 Universidad Rey Juan Carlos +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "BarrierBlock.h" +#include "Barrier.h" +#include "config.h" +#if (defined LLVM_3_1 or defined LLVM_3_2) +#include "llvm/Instructions.h" +#else +#include "llvm/IR/Instructions.h" +#endif +#include <cassert> + +using namespace llvm; +using namespace pocl; + +static bool +verify(const BasicBlock *B); + +bool +BarrierBlock::classof(const BasicBlock *B) +{ + if ((B->size() == 2) && + isa<Barrier> (&B->front())) { + assert(verify(B)); + return true; + } + + return false; +} + +static bool +verify(const BasicBlock *B) +{ + assert((B->size() == 2) && "Barriers blocks should have no functionality!"); + // const Instruction *barrier = B->getFirstNonPHI(); + // assert(isa<Barrier>(barrier) && "Barriers blocks should have no functionality!"); + // assert(B->getTerminator()->getPrevNode() == barrier && + // "Barriers blocks should have no functionality!"); +#if 1 // We want to allow barriers with more than one predecessors (?) + // (for loop header barriers). + assert(((B->getSinglePredecessor() != NULL) || + (B == &(B->getParent()->front()))) && + "Barrier blocks should have exactly one predecessor (except entry barrier)!"); +#endif +#if 0 // We want to allow barriers with more than one successor (for latch barriers). 
+ assert((B->getTerminator()->getNumSuccessors() <= 1) && + "Barrier blocks should have one successor, or zero for exit barriers!"); +#endif + assert(isa<Barrier>(B->front())); + + return true; +} + diff --git a/src/llvmopencl/BarrierBlock.h b/src/llvmopencl/BarrierBlock.h new file mode 100644 index 0000000..6246751 --- /dev/null +++ b/src/llvmopencl/BarrierBlock.h @@ -0,0 +1,44 @@ +// Class for a basic block that just contains a barrier. +// +// Copyright (c) 2011 Universidad Rey Juan Carlos +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#include "config.h" +#if (defined LLVM_3_1 or defined LLVM_3_2) +#include "llvm/BasicBlock.h" +#else +#include "llvm/IR/BasicBlock.h" +#endif + +#ifndef _POCL_BARRIER_BLOCK_H +#define _POCL_BARRIER_BLOCK_H + +namespace pocl { + + class BarrierBlock : public llvm::BasicBlock { + + public: + static bool classof(const BarrierBlock *) { return true; }; + static bool classof(const llvm::BasicBlock *B); + }; + +} + +#endif diff --git a/src/llvmopencl/BarrierTailReplication.cc b/src/llvmopencl/BarrierTailReplication.cc new file mode 100644 index 0000000..12bac74 --- /dev/null +++ b/src/llvmopencl/BarrierTailReplication.cc @@ -0,0 +1,421 @@ +// LLVM function pass to replicate barrier tails (successors to barriers). +// +// Copyright (c) 2011 Universidad Rey Juan Carlos and +// 2012 Pekka Jääskeläinen / TUT +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#include "config.h" +#include "BarrierTailReplication.h" +#include "Barrier.h" +#include "Workgroup.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#if (defined LLVM_3_1 or defined LLVM_3_2) +#include "llvm/InstrTypes.h" +#include "llvm/Instructions.h" +#else +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instructions.h" +#endif + +#include <iostream> +#include <algorithm> + +using namespace llvm; +using namespace pocl; + +//#define DEBUG_BARRIER_REPL + +static bool block_has_barrier(const BasicBlock *bb); + +namespace { + static + RegisterPass<BarrierTailReplication> X("barriertails", + "Barrier tail replication pass"); +} + +char BarrierTailReplication::ID = 0; + +void +BarrierTailReplication::getAnalysisUsage(AnalysisUsage &AU) const +{ + AU.addRequired<DominatorTree>(); + AU.addPreserved<DominatorTree>(); + AU.addRequired<LoopInfo>(); + AU.addPreserved<LoopInfo>(); +} + +bool +BarrierTailReplication::runOnFunction(Function &F) +{ + if (!Workgroup::isKernelToProcess(F)) + return false; + +#ifdef DEBUG_BARRIER_REPL + std::cerr << "### BTR on " << F.getName().str() << std::endl; +#endif + + DT = &getAnalysis<DominatorTree>(); + LI = &getAnalysis<LoopInfo>(); + + bool changed = ProcessFunction(F); + + DT->verifyAnalysis(); + LI->verifyAnalysis(); + + /* The created tails might contain PHI nodes with operands + referring to the non-predecessor (split point) BB. + These must be cleaned to avoid breakage later on. 
+ */ + for (Function::iterator i = F.begin(), e = F.end(); + i != e; ++i) + { + llvm::BasicBlock *bb = i; + changed |= CleanupPHIs(bb); + } + + return changed; +} + +bool +BarrierTailReplication::ProcessFunction(Function &F) +{ + BasicBlockSet processed_bbs; + + return FindBarriersDFS(&F.getEntryBlock(), processed_bbs); +} + + +// Recursively (depth-first) look for barriers in all possible +// execution paths starting on entry, replicating the barrier +// successors to ensure there is a separate function exit BB +// for each combination of traversed barriers. The set +// processed_bbs stores the basic blocks that have already been visited. +bool +BarrierTailReplication::FindBarriersDFS(BasicBlock *bb, + BasicBlockSet &processed_bbs) +{ + bool changed = false; + + // Check if we already visited this BB (to avoid + // infinite recursion in case of unbarriered loops). + if (processed_bbs.count(bb) != 0) + return changed; + + processed_bbs.insert(bb); + + if (block_has_barrier(bb)) { +#ifdef DEBUG_BARRIER_REPL + std::cerr << "### block " << bb->getName().str() << " has barrier, RJS" << std::endl; +#endif + BasicBlockSet processed_bbs_rjs; + changed = ReplicateJoinedSubgraphs(bb, bb, processed_bbs_rjs); + } + + TerminatorInst *t = bb->getTerminator(); + + // Find barriers in the successors (depth first). + for (unsigned i = 0, e = t->getNumSuccessors(); i != e; ++i) + changed |= FindBarriersDFS(t->getSuccessor(i), processed_bbs); + + return changed; +} + + +// Only replicate those parts of the subgraph that are not +// dominated by a (barrier) basic block, to avoid excessive +// (and confusing) code replication. 
+bool +BarrierTailReplication::ReplicateJoinedSubgraphs(BasicBlock *dominator, + BasicBlock *subgraph_entry, + BasicBlockSet &processed_bbs) +{ + bool changed = false; + + assert(DT->dominates(dominator, subgraph_entry)); + + Function *f = dominator->getParent(); + + TerminatorInst *t = subgraph_entry->getTerminator(); + for (int i = 0, e = t->getNumSuccessors(); i != e; ++i) { + BasicBlock *b = t->getSuccessor(i); +#ifdef DEBUG_BARRIER_REPL + std::cerr << "### traversing from " << subgraph_entry->getName().str() + << " to " << b->getName().str() << std::endl; +#endif + + // Check if we already handled this BB and all its branches. + if (processed_bbs.count(b) != 0) + { +#ifdef DEBUG_BARRIER_REPL + std::cerr << "### already processed " << std::endl; +#endif + continue; + } + + const bool isBackedge = DT->dominates(b, subgraph_entry); + if (isBackedge) { + // This is a loop backedge. Do not find subgraphs across + // those. +#ifdef DEBUG_BARRIER_REPL + std::cerr << "### a loop backedge, skipping" << std::endl; +#endif + continue; + } + if (DT->dominates(dominator, b)) + { +#ifdef DEBUG_BARRIER_REPL + std::cerr << "### " << dominator->getName().str() << " dominates " + << b->getName().str() << std::endl; +#endif + changed |= ReplicateJoinedSubgraphs(dominator, b, processed_bbs); + } + else + { +#ifdef DEBUG_BARRIER_REPL + std::cerr << "### " << dominator->getName().str() << " does not dominate " + << b->getName().str() << " replicating " << std::endl; +#endif + BasicBlock *replicated_subgraph_entry = + ReplicateSubgraph(b, f); + t->setSuccessor(i, replicated_subgraph_entry); + changed = true; + } + + if (changed) + { + // We have modified the function. Possibly created new loops. + // Update analysis passes. 
+ DT->runOnFunction(*f); + #ifdef LLVM_3_1 + LI->getBase().Calculate(DT->getBase()); + #else + LI->runOnFunction(*f); + #endif + } + } + processed_bbs.insert(subgraph_entry); + return changed; +} + +// Removes phi elements for which there are no successors (anymore). +bool +BarrierTailReplication::CleanupPHIs(llvm::BasicBlock *BB) +{ + + bool changed = false; +#ifdef DEBUG_BARRIER_REPL + std::cerr << "### CleanupPHIs for BB:" << std::endl; + BB->dump(); +#endif + + for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE; ) + { + PHINode *PN = dyn_cast<PHINode>(BI); + if (PN == NULL) break; + + bool PHIRemoved = false; + for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i) + { + bool isSuccessor = false; + // find if the predecessor branches to this one (anymore) + for (unsigned s = 0, + se = PN->getIncomingBlock(i)->getTerminator()->getNumSuccessors(); + s < se; ++s) { + if (PN->getIncomingBlock(i)->getTerminator()->getSuccessor(s) == BB) + { + isSuccessor = true; + break; + } + } + if (!isSuccessor) + { +#ifdef DEBUG_BARRIER_REPL + std::cerr << "removing incoming value " << i << " from PHINode:" << std::endl; + PN->dump(); +#endif + PN->removeIncomingValue(i, true); +#ifdef DEBUG_BARRIER_REPL + std::cerr << "now:" << std::endl; + PN->dump(); +#endif + changed = true; + e--; + if (e == 0) + { + PHIRemoved = true; + break; + } + i = 0; + continue; + } + } + if (PHIRemoved) + BI = BB->begin(); + else + BI++; + } + return changed; +} + +BasicBlock * +BarrierTailReplication::ReplicateSubgraph(BasicBlock *entry, + Function *f) +{ + // Find all basic blocks to replicate. + BasicBlockVector subgraph; + FindSubgraph(subgraph, entry); + + // Replicate subgraph maintaining control flow. + BasicBlockVector v; + + ValueToValueMapTy m; + ReplicateBasicBlocks(v, m, subgraph, f); + UpdateReferences(v, m); + + // Return entry block of replicated subgraph. 
+ return cast<BasicBlock>(m[entry]); +} + + +void +BarrierTailReplication::FindSubgraph(BasicBlockVector &subgraph, + BasicBlock *entry) +{ + // The subgraph can have internal branches (join points) + // avoid replicating these parts multiple times within the + // same tail. + if (std::count(subgraph.begin(), subgraph.end(), entry) > 0) + return; + + subgraph.push_back(entry); + + const TerminatorInst *t = entry->getTerminator(); + Loop *l = LI->getLoopFor(entry); + for (unsigned i = 0, e = t->getNumSuccessors(); i != e; ++i) { + BasicBlock *successor = t->getSuccessor(i); + const bool isBackedge = DT->dominates(successor, entry); + if (isBackedge) continue; + FindSubgraph(subgraph, successor); + } +} + + +void +BarrierTailReplication::ReplicateBasicBlocks(BasicBlockVector &new_graph, + ValueToValueMapTy &reference_map, + BasicBlockVector &graph, + Function *f) +{ +#ifdef DEBUG_BARRIER_REPL + std::cerr << "### ReplicateBasicBlocks: " << std::endl; +#endif + for (BasicBlockVector::const_iterator i = graph.begin(), + e = graph.end(); + i != e; ++i) { + BasicBlock *b = *i; + BasicBlock *new_b = BasicBlock::Create(b->getContext(), + b->getName() + ".btr", + f); + reference_map.insert(std::make_pair(b, new_b)); + new_graph.push_back(new_b); + +#ifdef DEBUG_BARRIER_REPL + std::cerr << "Replicated BB: " << new_b->getName().str() << std::endl; +#endif + + for (BasicBlock::iterator i2 = b->begin(), e2 = b->end(); + i2 != e2; ++i2) { + Instruction *i = i2->clone(); + reference_map.insert(std::make_pair(i2, i)); + new_b->getInstList().push_back(i); + } + + // Add predicates to PHINodes of basic blocks the replicated + // block jumps to (backedges). + TerminatorInst *t = new_b->getTerminator(); + for (unsigned i = 0, e = t->getNumSuccessors(); i != e; ++i) { + BasicBlock *successor = t->getSuccessor(i); + if (std::count(graph.begin(), graph.end(), successor) == 0) { + // Successor is not in the graph, possible backedge. 
+ for (BasicBlock::iterator i = successor->begin(), e = successor->end(); + i != e; ++i) { + PHINode *phi = dyn_cast<PHINode>(i); + if (phi == NULL) + break; // All PHINodes already checked. + + // Get value for original incoming edge and add new predicate. + Value *v = phi->getIncomingValueForBlock(b); + Value *new_v = reference_map[v]; + if (new_v == NULL) { + /* This case can happen at least when replicating a latch + block in a b-loop. The value produced might be from a common + path before the replicated part. Then just use the original value.*/ + new_v = v; +#if 0 + std::cerr << "### could not find a replacement block for phi node (" + << b->getName().str() << ")" << std::endl; + phi->dump(); + v->dump(); + f->viewCFG(); + assert (0); +#endif + } + phi->addIncoming(new_v, new_b); + } + } + } + } + +#ifdef DEBUG_BARRIER_REPL + std::cerr << std::endl; +#endif +} + + +void +BarrierTailReplication::UpdateReferences(const BasicBlockVector &graph, + ValueToValueMapTy &reference_map) +{ + for (BasicBlockVector::const_iterator i = graph.begin(), + e = graph.end(); + i != e; ++i) { + BasicBlock *b = *i; + for (BasicBlock::iterator i2 = b->begin(), e2 = b->end(); + i2 != e2; ++i2) { + Instruction *i = i2; + RemapInstruction(i, reference_map, + RF_IgnoreMissingEntries | RF_NoModuleLevelChanges); + } + } +} + + +static bool +block_has_barrier(const BasicBlock *bb) +{ + for (BasicBlock::const_iterator i = bb->begin(), e = bb->end(); + i != e; ++i) { + if (isa<Barrier>(i)) + return true; + } + + return false; +} diff --git a/src/llvmopencl/BarrierTailReplication.h b/src/llvmopencl/BarrierTailReplication.h new file mode 100644 index 0000000..7e3beb0 --- /dev/null +++ b/src/llvmopencl/BarrierTailReplication.h @@ -0,0 +1,85 @@ +// Header for BarrierTailReplication.cc function pass. 
+// +// Copyright (c) 2011 Universidad Rey Juan Carlos +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#ifndef POCL_BARRIER_TAIL_REPLICATION +#define POCL_BARRIER_TAIL_REPLICATION + +#include "config.h" +#if (defined LLVM_3_1 or defined LLVM_3_2) +#include "llvm/Function.h" +#else +#include "llvm/IR/Function.h" +#endif + +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include <map> +#include <set> + +namespace pocl { + class Workgroup; + + class BarrierTailReplication : public llvm::FunctionPass { + + public: + static char ID; + + BarrierTailReplication(): FunctionPass(ID) {} + + virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const; + virtual bool runOnFunction(llvm::Function &F); + + private: + typedef std::set<llvm::BasicBlock *> BasicBlockSet; + typedef std::vector<llvm::BasicBlock *> BasicBlockVector; + typedef std::map<llvm::Value *, llvm::Value *> ValueValueMap; + + llvm::DominatorTree *DT; + llvm::LoopInfo *LI; + + bool ProcessFunction(llvm::Function &F); + bool FindBarriersDFS(llvm::BasicBlock *bb, + BasicBlockSet &processed_bbs); + bool ReplicateJoinedSubgraphs(llvm::BasicBlock *dominator, + llvm::BasicBlock *subgraph_entry, + BasicBlockSet &processed_bbs); + + llvm::BasicBlock* ReplicateSubgraph(llvm::BasicBlock *entry, + llvm::Function *f); + void FindSubgraph(BasicBlockVector &subgraph, + llvm::BasicBlock *entry); + void ReplicateBasicBlocks(BasicBlockVector &new_graph, + llvm::ValueToValueMapTy &reference_map, + BasicBlockVector &graph, + llvm::Function *f); + void UpdateReferences(const BasicBlockVector &graph, + llvm::ValueToValueMapTy &reference_map); + + bool CleanupPHIs(llvm::BasicBlock *BB); + + friend class pocl::Workgroup; + }; +} + +#endif diff --git a/src/llvmopencl/BreakConstantGEPs.cpp b/src/llvmopencl/BreakConstantGEPs.cpp new file mode 100644 index 0000000..a12aaaa --- /dev/null +++ b/src/llvmopencl/BreakConstantGEPs.cpp @@ -0,0 +1,326 @@ +//===- BreakConstantGEPs.cpp - Change constant GEPs into GEP instructions - --// +// +// pocl 
note: This pass is taken from The SAFECode project with trivial modifications. +// Automatic locals might cause constant GEPs which cause problems during +// converting the locals to kernel function arguments for thread safety. +// +// The SAFECode Compiler +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass changes all GEP constant expressions into GEP instructions. This +// permits the rest of SAFECode to put run-time checks on them if necessary. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "break-constgeps" + +#include "config.h" +#if (defined LLVM_3_1 or defined LLVM_3_2) +#include "llvm/Constants.h" +#include "llvm/InstrTypes.h" +#include "llvm/Instruction.h" +#include "llvm/Instructions.h" +#include "llvm/LLVMContext.h" +#else +#include "llvm/IR/Constants.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#endif +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/InstIterator.h" + +#include "BreakConstantGEPs.h" +#include "Workgroup.h" + +#include <iostream> +#include <map> +#include <utility> + +// Identifier variable for the pass +char BreakConstantGEPs::ID = 0; + +// Statistics +STATISTIC (GEPChanges, "Number of Converted GEP Constant Expressions"); +STATISTIC (TotalChanges, "Number of Converted Constant Expressions"); + +// Register the pass +static RegisterPass<BreakConstantGEPs> P ("break-constgeps", + "Remove GEP Constant Expressions"); + +// +// Function: hasConstantGEP() +// +// Description: +// This function determines whether the given value is a constant expression +// that has a constant GEP expression embedded within it. +// +// Inputs: +// V - The value to check. 
+// +// Return value: +// NULL - This value is not a constant expression with a constant expression +// GEP within it. +// ~NULL - A pointer to the value casted into a ConstantExpr is returned. +// +static ConstantExpr * +hasConstantGEP (Value * V) { + if (ConstantExpr * CE = dyn_cast<ConstantExpr>(V)) { + if (CE->getOpcode() == Instruction::GetElementPtr || + CE->getOpcode() == Instruction::BitCast) + { + return CE; + } else { + for (unsigned index = 0; index < CE->getNumOperands(); ++index) { + if (hasConstantGEP (CE->getOperand(index))) + return CE; + } + } + } + + return 0; +} + +// +// Function: convertGEP() +// +// Description: +// Convert a GEP constant expression into a GEP instruction. +// +// Inputs: +// CE - The GEP constant expression. +// InsertPt - The instruction before which to insert the new GEP instruction. +// +// Return value: +// A pointer to the new GEP instruction is returned. +// +static Instruction * +convertGEP (ConstantExpr * CE, Instruction * InsertPt) { + // + // Create iterators to the indices of the constant expression. + // + std::vector<Value *> Indices; + for (unsigned index = 1; index < CE->getNumOperands(); ++index) { + Indices.push_back (CE->getOperand (index)); + } + + // + // Update the statistics. + // + ++GEPChanges; + + // + // Make the new GEP instruction. + // + return (GetElementPtrInst::Create (CE->getOperand(0), + Indices, + CE->getName(), + InsertPt)); +} + +// +// Function: convertExpression() +// +// Description: +// Convert a constant expression into an instruction. This routine does *not* +// perform any recursion, so the resulting instruction may have constant +// expression operands. +// +static Instruction * +convertExpression (ConstantExpr * CE, Instruction * InsertPt) { + // + // Convert this constant expression into a regular instruction. 
+ // + Instruction * NewInst = 0; + switch (CE->getOpcode()) { + case Instruction::GetElementPtr: { + NewInst = convertGEP (CE, InsertPt); + break; + } + + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + Instruction::BinaryOps Op = (Instruction::BinaryOps)(CE->getOpcode()); + NewInst = BinaryOperator::Create (Op, + CE->getOperand(0), + CE->getOperand(1), + CE->getName(), + InsertPt); + break; + } + + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::BitCast: { + Instruction::CastOps Op = (Instruction::CastOps)(CE->getOpcode()); + NewInst = CastInst::Create (Op, + CE->getOperand(0), + CE->getType(), + CE->getName(), + InsertPt); + break; + } + + case Instruction:: FCmp: + case Instruction:: ICmp: { + Instruction::OtherOps Op = (Instruction::OtherOps)(CE->getOpcode()); + NewInst = CmpInst::Create (Op, + CE->getPredicate(), + CE->getOperand(0), + CE->getOperand(1), + CE->getName(), + InsertPt); + break; + } + + case Instruction:: Select: + NewInst = SelectInst::Create (CE->getOperand(0), + CE->getOperand(1), + CE->getOperand(2), + CE->getName(), + InsertPt); + break; + + case Instruction:: ExtractElement: + case Instruction:: InsertElement: + case Instruction:: ShuffleVector: + case Instruction:: InsertValue: + default: + assert (0 && "Unhandled constant expression!\n"); + break; + } + + // + // Update the statistics. 
+ // + ++TotalChanges; + + return NewInst; +} + +// +// Method: runOnFunction() +// +// Description: +// Entry point for this LLVM pass. +// +// Return value: +// true - The function was modified. +// false - The function was not modified. +// +bool +BreakConstantGEPs::runOnFunction (Function & F) { + + if (!pocl::Workgroup::isKernelToProcess(F)) return false; + + bool modified = false; + + // Worklist of values to check for constant GEP expressions + std::vector<Instruction *> Worklist; + + // + // Initialize the worklist by finding all instructions that have one or more + // operands containing a constant GEP expression. + // + for (Function::iterator BB = F.begin(); BB != F.end(); ++BB) { + for (BasicBlock::iterator i = BB->begin(); i != BB->end(); ++i) { + // + // Scan through the operands of this instruction. If it is a constant + // expression GEP, insert an instruction GEP before the instruction. + // + Instruction * I = i; + for (unsigned index = 0; index < I->getNumOperands(); ++index) { + if (hasConstantGEP (I->getOperand(index))) { + Worklist.push_back (I); + } + } + } + } + + // + // Determine whether we will modify anything. + // + if (Worklist.size()) modified = true; + + // + // While the worklist is not empty, take an item from it, convert the + // operands into instructions if necessary, and determine if the newly + // added instructions need to be processed as well. + // + while (Worklist.size()) { + Instruction * I = Worklist.back(); + Worklist.pop_back(); + + // + // Scan through the operands of this instruction and convert each into an + // instruction. Note that this works a little differently for phi + // instructions because the new instruction must be added to the + // appropriate predecessor block. 
+ // + if (PHINode * PHI = dyn_cast<PHINode>(I)) { + for (unsigned index = 0; index < PHI->getNumIncomingValues(); ++index) { + // + // For PHI Nodes, if an operand is a constant expression with a GEP, we + // want to insert the new instructions in the predecessor basic block. + // + // Note: It seems that it's possible for a phi to have the same + // incoming basic block listed multiple times; this seems okay as long + // the same value is listed for the incoming block. + // + Instruction * InsertPt = PHI->getIncomingBlock(index)->getTerminator(); + if (ConstantExpr * CE = hasConstantGEP (PHI->getIncomingValue(index))) { + Instruction * NewInst = convertExpression (CE, InsertPt); + for (unsigned i2 = index; i2 < PHI->getNumIncomingValues(); ++i2) { + if ((PHI->getIncomingBlock (i2)) == PHI->getIncomingBlock (index)) + PHI->setIncomingValue (i2, NewInst); + } + Worklist.push_back (NewInst); + } + } + } else { + for (unsigned index = 0; index < I->getNumOperands(); ++index) { + // + // For other instructions, we want to insert instructions replacing + // constant expressions immediently before the instruction using the + // constant expression. + // + if (ConstantExpr * CE = hasConstantGEP (I->getOperand(index))) { + Instruction * NewInst = convertExpression (CE, I); + I->replaceUsesOfWith (CE, NewInst); + Worklist.push_back (NewInst); + } + } + } + } + + return modified; +} + + diff --git a/src/llvmopencl/BreakConstantGEPs.h b/src/llvmopencl/BreakConstantGEPs.h new file mode 100644 index 0000000..4cd86b2 --- /dev/null +++ b/src/llvmopencl/BreakConstantGEPs.h @@ -0,0 +1,57 @@ +//===- BreakConstantGEPs.h - Change constant GEPs into GEP instructions --- --// +// +// pocl note: This pass is taken from The SAFECode project with trivial modifications. +// Automatic locals might cause constant GEPs which cause problems during +// converting the locals to kernel function arguments for thread safety. 
+// +// The SAFECode Compiler +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass changes all GEP constant expressions into GEP instructions. This +// permits the rest of SAFECode to put run-time checks on them if necessary. +// +//===----------------------------------------------------------------------===// + +#ifndef BREAKCONSTANTGEPS_H +#define BREAKCONSTANTGEPS_H + +#include "config.h" +#if (defined LLVM_3_1 or defined LLVM_3_2) +#include "llvm/Module.h" +#else +#include "llvm/IR/Module.h" +#endif +#include "llvm/Analysis/Dominators.h" +#include "llvm/Pass.h" + +using namespace llvm; + +// +// Pass: BreakConstantGEPs +// +// Description: +// This pass modifies a function so that it uses GEP instructions instead of +// GEP constant expressions. +// +struct BreakConstantGEPs : public FunctionPass { + private: + // Private methods + + // Private variables + + public: + static char ID; + BreakConstantGEPs() : FunctionPass(ID) {} + const char *getPassName() const {return "Remove Constant GEP Expressions";} + virtual bool runOnFunction (Function & F); + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + // This pass does not modify the control-flow graph of the function + AU.setPreservesCFG(); + } +}; + +#endif diff --git a/src/llvmopencl/CanonicalizeBarriers.cc b/src/llvmopencl/CanonicalizeBarriers.cc new file mode 100644 index 0000000..409e264 --- /dev/null +++ b/src/llvmopencl/CanonicalizeBarriers.cc @@ -0,0 +1,214 @@ +// LLVM function pass to canonicalize barriers. 
+// +// Copyright (c) 2011 Universidad Rey Juan Carlos +// 2012 Pekka Jääskeläinen / Tampere University of Technology +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#include "config.h" +#include "CanonicalizeBarriers.h" +#include "BarrierBlock.h" +#include "Barrier.h" +#include "Workgroup.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include <iostream> + +#if (defined LLVM_3_1 or defined LLVM_3_2) +#include "llvm/Instructions.h" +#include "llvm/Module.h" +#else +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#endif + +using namespace llvm; +using namespace pocl; + +namespace { + static + RegisterPass<CanonicalizeBarriers> X("barriers", + "Barrier canonicalization pass"); +} + +char CanonicalizeBarriers::ID = 0; + +void +CanonicalizeBarriers::getAnalysisUsage(AnalysisUsage &AU) const +{ +} + +bool +CanonicalizeBarriers::runOnFunction(Function &F) +{ + if (!Workgroup::isKernelToProcess(F)) + return false; + + BasicBlock *entry = &F.getEntryBlock(); + if (!isa<BarrierBlock>(entry)) { + BasicBlock *effective_entry = SplitBlock(entry, + &(entry->front()), + this); + effective_entry->takeName(entry); + entry->setName("entry.barrier"); + Barrier::Create(entry->getTerminator()); + } + + for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) { + BasicBlock *b = i; + TerminatorInst *t = b->getTerminator(); + if ((t->getNumSuccessors() == 0) && (!isa<BarrierBlock>(b))) { + /* In case the bb is already terminated with a barrier, + split before the barrier so we dot create an empty + parallel region. + + This is because the assumptions of the other passes in the + compilation that are + a) exit node is a barrier block + b) there are no empty parallel regions (which would be formed + between the explicit barrier and the added one). 
*/ + BasicBlock *exit; + if (Barrier::endsWithBarrier(b)) + exit = SplitBlock(b, t->getPrevNode(), this); + else + exit = SplitBlock(b, t, this); + exit->setName("exit.barrier"); + Barrier::Create(t); + } + } + + DT = getAnalysisIfAvailable<DominatorTree>(); + LI = getAnalysisIfAvailable<LoopInfo>(); + + bool changed = ProcessFunction(F); + + if (DT) + DT->verifyAnalysis(); + if (LI) + LI->verifyAnalysis(); + + return changed; +} + + +// Canonicalize barriers: ensure all barriers are in a separate BB +// containing only the barrier and the terminator, with just one +// predecessor and one successor. This allows us to use +// those BBs as markers only, they will not be replicated. +bool +CanonicalizeBarriers::ProcessFunction(Function &F) +{ + bool changed = false; + + InstructionSet Barriers; + + for (Function::iterator i = F.begin(), e = F.end(); + i != e; ++i) + { + BasicBlock *b = i; + for (BasicBlock::iterator i = b->begin(), e = b->end(); + i != e; ++i) + { + if (isa<Barrier>(i)) + { + Barriers.insert(i); + } + } + } + + // Finally add all the split points, now that we are done with the + // iterators. + for (InstructionSet::iterator i = Barriers.begin(), e = Barriers.end(); + i != e; ++i) { + BasicBlock *b = (*i)->getParent(); + + // Split post barrier first cause it does not make the barrier + // to belong to another basic block. + TerminatorInst *t = b->getTerminator(); + // if ((t->getNumSuccessors() > 1) || + // (t->getPrevNode() != *i)) { + // Change: barriers with several successors are all right + // they just start several parallel regions. Simplifies + // loop handling. 
+ + const bool HAS_NON_BRANCH_INSTRUCTIONS_AFTER_BARRIER = + t->getPrevNode() != *i; + + if (HAS_NON_BRANCH_INSTRUCTIONS_AFTER_BARRIER) { + BasicBlock *new_b = SplitBlock(b, (*i)->getNextNode(), this); + new_b->setName(b->getName() + ".postbarrier"); + changed = true; + } + + BasicBlock *predecessor = b->getSinglePredecessor(); + if (predecessor != NULL) { + TerminatorInst *pt = predecessor->getTerminator(); + if ((pt->getNumSuccessors() == 1) && + (&b->front() == (*i))) { + // Barrier is at the beginning of the BB, + // which has a single predecessor with just + // one successor (the barrier itself), thus + // no need to split before barrier. + continue; + } + } + if ((b == &(b->getParent()->getEntryBlock())) && + (&b->front() == (*i))) + continue; + + // If no instructions before barrier, do not split + // (allow multiple predecessors, eases loop handling). + // if (&b->front() == (*i)) + // continue; + BasicBlock *new_b = SplitBlock(b, *i, this); + new_b->takeName(b); + b->setName(new_b->getName() + ".prebarrier"); + changed = true; + } + + /* Prune empty regions. That is, if there are two successive + barriers, remove the other one. 
*/ + bool emptyRegionDeleted = false; + do { + emptyRegionDeleted = false; + for (Function::iterator i = F.begin(), e = F.end(); + i != e; ++i) + { + BasicBlock *b = i; + llvm::TerminatorInst *t = b->getTerminator(); + if (!Barrier::endsWithBarrier(b) || t->getNumSuccessors() != 1) continue; + + BasicBlock *successor = t->getSuccessor(0); + + if (Barrier::hasOnlyBarrier(successor) && + successor->getSinglePredecessor() == b && + successor->getTerminator()->getNumSuccessors() == 1) + { + b->getTerminator()->setSuccessor(0, successor->getTerminator()->getSuccessor(0)); + successor->replaceAllUsesWith(b); + successor->eraseFromParent(); + emptyRegionDeleted = true; + changed = true; + break; + } + } + } while (emptyRegionDeleted); + + + return changed; +} diff --git a/src/llvmopencl/CanonicalizeBarriers.h b/src/llvmopencl/CanonicalizeBarriers.h new file mode 100644 index 0000000..047db1d --- /dev/null +++ b/src/llvmopencl/CanonicalizeBarriers.h @@ -0,0 +1,56 @@ +// Header for CanonicalizeBarriers.cc function pass. +// +// Copyright (c) 2011 Universidad Rey Juan Carlos +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "config.h" +#if (defined LLVM_3_1 or defined LLVM_3_2) +#include "llvm/Function.h" +#else +#include "llvm/IR/Function.h" +#endif +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Pass.h" +#include <set> + +namespace pocl { + class Workgroup; + + class CanonicalizeBarriers : public llvm::FunctionPass { + + public: + static char ID; + + CanonicalizeBarriers() : FunctionPass(ID) {} + + virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const; + virtual bool runOnFunction(llvm::Function &F); + + private: + typedef std::set<llvm::Instruction *> InstructionSet; + + llvm::LoopInfo *LI; + llvm::DominatorTree *DT; + + bool ProcessFunction(llvm::Function &F); + + friend class pocl::Workgroup; + }; +} diff --git a/src/llvmopencl/Flatten.cc b/src/llvmopencl/Flatten.cc new file mode 100644 index 0000000..2e01f2a --- /dev/null +++ b/src/llvmopencl/Flatten.cc @@ -0,0 +1,158 @@ +// LLVM module pass to inline required functions (those accessing +// per-workgroup variables) into the kernel. 
+// +// Copyright (c) 2011 Universidad Rey Juan Carlos +// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#include "Flatten.h" +using namespace pocl; + +extern cl::opt<std::string> KernelName; + +char Flatten::ID = 0; +static RegisterPass<Flatten> X("flatten", "Kernel function flattening pass"); + +static const char *workgroup_variables[] = { + "_local_id_x", "_local_id_y", "_local_id_z", + "_local_size_x", "_local_size_y", "_local_size_z", + "_work_dim", + "_num_groups_x", "_num_groups_y", "_num_groups_z", + "_group_id_x", "_group_id_y", "_group_id_z", + "_global_offset_x", "_global_offset_y", "_global_offset_z", + NULL}; + +//#define DEBUG_FLATTEN + +#define INLINE_ALL_NON_KERNEL + +#ifdef INLINE_ALL_NON_KERNEL + +bool +Flatten::runOnModule(Module &M) +{ + bool changed = false; + for (llvm::Module::iterator i = M.begin(), e = M.end(); i != e; ++i) + { + llvm::Function *f = i; + if (f->isDeclaration()) continue; + if (KernelName == f->getName() || + (KernelName == "" && pocl::Workgroup::isKernelToProcess(*f))) + { +#ifdef LLVM_3_1 + f->removeFnAttr(Attribute::AlwaysInline); + f->addFnAttr(Attribute::NoInline); +#elif defined LLVM_3_2 + AttrBuilder b; + f->removeFnAttr(Attributes::get(M.getContext(), b.addAttribute(Attributes::AlwaysInline))); + f->addFnAttr(Attributes::NoInline); +#else + AttributeSet attrs; + f->removeAttributes( + AttributeSet::FunctionIndex, + attrs.addAttribute(M.getContext(), AttributeSet::FunctionIndex, Attribute::AlwaysInline)); + + f->addFnAttr(Attribute::NoInline); +#endif + + f->setLinkage(llvm::GlobalValue::ExternalLinkage); + changed = true; +#ifdef DEBUG_FLATTEN + std::cerr << "### NoInline for " << f->getName().str() << std::endl; +#endif + } + else + { +#ifdef LLVM_3_1 + f->removeFnAttr(Attribute::NoInline); + f->addFnAttr(Attribute::AlwaysInline); +#elif defined LLVM_3_2 + AttrBuilder b; + f->removeFnAttr(Attributes::get(M.getContext(), b.addAttribute(Attributes::NoInline))); + f->addFnAttr(Attributes::AlwaysInline); +#else + AttributeSet attrs; + f->removeAttributes( + AttributeSet::FunctionIndex, + 
attrs.addAttribute(M.getContext(), AttributeSet::FunctionIndex, Attribute::NoInline)); + f->addFnAttr(Attribute::AlwaysInline); +#endif + + f->setLinkage(llvm::GlobalValue::InternalLinkage); + changed = true; +#ifdef DEBUG_FLATTEN + std::cerr << "### AlwaysInline for " << f->getName().str() << std::endl; +#endif + } + } + return changed; +} + +#else + +bool +Flatten::runOnModule(Module &M) +{ + SmallPtrSet<Function *, 8> functions_to_inline; + SmallVector<Value *, 8> pending; + + const char **s = workgroup_variables; + while (*s != NULL) { + GlobalVariable *gv = M.getGlobalVariable(*s); + if (gv != NULL) + pending.push_back(gv); + + ++s; + } + + while (!pending.empty()) { + Value *v = pending.back(); + pending.pop_back(); + + for (Value::use_iterator i = v->use_begin(), e = v->use_end(); + i != e; ++i) { + if (Instruction *ci = dyn_cast<Instruction>(*i)) { + // Prevent infinite looping on recursive functions + // (though OpenCL does not allow this?) + Function *f = ci->getParent()->getParent();; + assert((f != NULL) && + "Per-workgroup global variable used on function with no parent!"); + if (functions_to_inline.count(f)) + continue; + + functions_to_inline.insert(f); + pending.push_back(f); + } + } + } + + for (SmallPtrSet<Function *, 8>::iterator i = functions_to_inline.begin(), + e = functions_to_inline.end(); + i != e; ++i) { + (*i)->removeFnAttr(Attribute::NoInline); + (*i)->addFnAttr(Attribute::AlwaysInline); + } + + return true; +} + +#endif + + diff --git a/src/llvmopencl/Flatten.h b/src/llvmopencl/Flatten.h new file mode 100644 index 0000000..df3a174 --- /dev/null +++ b/src/llvmopencl/Flatten.h @@ -0,0 +1,51 @@ +// LLVM module pass to inline required functions (those accessing +// per-workgroup variables) into the kernel. 
+// +// Copyright (c) 2011 Universidad Rey Juan Carlos +// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "config.h" +#include <iostream> +#include <string> +#include "Workgroup.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Pass.h" +#if (defined LLVM_3_1 or defined LLVM_3_2) +#include "llvm/Module.h" +#else +#include "llvm/IR/Module.h" +#endif + +using namespace llvm; + +namespace pocl { + class Flatten : public ModulePass { + + public: + static char ID; + Flatten() : ModulePass(ID) {} + + virtual bool runOnModule(Module &M); + }; + +} + diff --git a/src/llvmopencl/GenerateHeader.cc b/src/llvmopencl/GenerateHeader.cc new file mode 100644 index 0000000..55a5bbe --- /dev/null +++ b/src/llvmopencl/GenerateHeader.cc @@ -0,0 +1,336 @@ +// LLVM module pass to get information from kernel functions. 
+// +// Copyright (c) 2011 Universidad Rey Juan Carlos +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#include "config.h" +#include "pocl.h" +#include "Workgroup.h" +#include "llvm/Pass.h" +#include "llvm/PassSupport.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Cloning.h" +#ifdef LLVM_3_1 +#include "llvm/Target/TargetData.h" +#elif defined LLVM_3_2 +#include "llvm/DataLayout.h" +#else +#include "llvm/IR/DataLayout.h" +#endif + +#if (defined LLVM_3_1 or defined LLVM_3_2) +#include "llvm/Argument.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/GlobalVariable.h" +#include "llvm/Instructions.h" +#include "llvm/Module.h" +#else +#include "llvm/IR/Argument.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#endif + +#include "LLVMUtils.h" + +using namespace std; +using namespace llvm; +using namespace pocl; + + +cl::opt<string> +Header("header", + cl::desc("Output header file with kernel description macros"), + cl::value_desc("header")); + +namespace { + class GenerateHeader : public ModulePass { + + public: + static char ID; + GenerateHeader() : ModulePass(ID) {} + + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual bool runOnModule(Module &M); + + private: + void ProcessPointers(Function *F, + raw_fd_ostream &out); + void ProcessReqdWGSize(Function *F, + raw_fd_ostream &out); + Function *ProcessAutomaticLocals(Function *F, + raw_fd_ostream &out); + }; +} + +char GenerateHeader::ID = 0; +static RegisterPass<GenerateHeader> X("generate-header", + "Kernel information header creation pass"); + +void +GenerateHeader::getAnalysisUsage(AnalysisUsage &AU) const +{ + AU.addRequired<DataLayout>(); +} + +bool +GenerateHeader::runOnModule(Module &M) +{ + bool changed = false; + + // store the new and old kernel pairs in order to regenerate + // 
all the metadata that used to point to the unmodified + // kernels + FunctionMapping kernels; + + string ErrorInfo; + raw_fd_ostream out(Header.c_str(), ErrorInfo, raw_fd_ostream::F_Append); + + for (Module::iterator mi = M.begin(), me = M.end(); mi != me; ++mi) { + if (!Workgroup::isKernelToProcess(*mi)) + continue; + + Function *F = mi; + + ProcessPointers(F, out); + ProcessReqdWGSize(F, out); + + Function *new_kernel = ProcessAutomaticLocals(F, out); + if (new_kernel != F) + changed = true; + kernels[F] = new_kernel; + } + + if (changed) + { + regenerate_kernel_metadata(M, kernels); + + /* Delete the old kernels. */ + for (FunctionMapping::const_iterator i = kernels.begin(), + e = kernels.end(); i != e; ++i) + { + Function *old_kernel = (*i).first; + Function *new_kernel = (*i).second; + if (old_kernel == new_kernel) continue; + old_kernel->eraseFromParent(); + } + } + return changed; +} + +#include <iostream> + +void +GenerateHeader::ProcessReqdWGSize(Function *F, + raw_fd_ostream &out) +{ + + unsigned LocalSizeX = 0, LocalSizeY = 0, LocalSizeZ = 0; + + llvm::NamedMDNode *size_info = F->getParent()->getNamedMetadata("opencl.kernel_wg_size_info"); + if (size_info) { + for (unsigned i = 0, e = size_info->getNumOperands(); i != e; ++i) { + llvm::MDNode *KernelSizeInfo = size_info->getOperand(i); + if (KernelSizeInfo->getOperand(0) == F) { + LocalSizeX = (llvm::cast<ConstantInt>(KernelSizeInfo->getOperand(1)))->getLimitedValue(); + LocalSizeY = (llvm::cast<ConstantInt>(KernelSizeInfo->getOperand(2)))->getLimitedValue(); + LocalSizeZ = (llvm::cast<ConstantInt>(KernelSizeInfo->getOperand(3)))->getLimitedValue(); + } + } + } + + out << "#define _" << F->getName() << "_REQD_WG_SIZE {" + << LocalSizeX << ", " + << LocalSizeY << ", " + << LocalSizeZ << "}\n"; +} + + +void +GenerateHeader::ProcessPointers(Function *F, + raw_fd_ostream &out) +{ + int num_args = F->getFunctionType()->getNumParams(); + + out << "#define _" << F->getName() << "_NUM_ARGS " << num_args << '\n'; 
+ + bool is_pointer[num_args]; + bool is_local[num_args]; + bool is_image[num_args]; + bool is_sampler[num_args]; + + int i = 0; + for (Function::const_arg_iterator ii = F->arg_begin(), + ee = F->arg_end(); + ii != ee; ++ii) { + Type *t = ii->getType(); + + is_image[i] = false; + is_sampler[i] = false; + + const PointerType *p = dyn_cast<PointerType>(t); + if (p && !ii->hasByValAttr()) { + is_pointer[i] = true; + // index 0 is for function attributes, parameters start at 1. + if (p->getAddressSpace() == POCL_ADDRESS_SPACE_GLOBAL || + p->getAddressSpace() == POCL_ADDRESS_SPACE_CONSTANT) + is_local[i] = false; + else + is_local[i] = true; + } else { + is_pointer[i] = false; + is_local[i] = false; + } + + if (t->isPointerTy()) { + if (t->getPointerElementType()->isStructTy()) { + string name = t->getPointerElementType()->getStructName().str(); + if (name == "struct.image2d_t_") { // TODO image3d? + is_image[i] = true; + is_pointer[i] = false; + is_local[i] = false; + } + if (name == "struct.sampler_t_") { + is_sampler[i] = true; + is_pointer[i] = false; + is_local[i] = false; + } + } + } + + ++i; + } + + out << "#define _" << F->getName() << "_ARG_IS_POINTER {"; + if (num_args != 0) { + out << is_pointer[0]; + for (i = 1; i < num_args; ++i) + out << ", " << is_pointer[i]; + } + out << "}\n"; + + out << "#define _" << F->getName() << "_ARG_IS_LOCAL {"; + if (num_args != 0) { + out << is_local[0]; + for (i = 1; i < num_args; ++i) + out << ", " << is_local[i]; + } + out << "}\n"; + + out << "#define _" << F->getName() << "_ARG_IS_IMAGE {"; + if (num_args != 0) { + out << is_image[0]; + for (i = 1; i < num_args; ++i) + out << ", " << is_image[i]; + } + out << "}\n"; + + out << "#define _" << F->getName() << "_ARG_IS_SAMPLER {"; + if (num_args != 0) { + out << is_sampler[0]; + for (i = 1; i < num_args; ++i) + out << ", " << is_sampler[i]; + } + out << "}\n"; +} + + +Function * +GenerateHeader::ProcessAutomaticLocals(Function *F, + raw_fd_ostream &out) +{ + Module *M = 
F->getParent(); + DataLayout &TD = getAnalysis<DataLayout>(); + + SmallVector<GlobalVariable *, 8> locals; + + SmallVector<Type *, 8> parameters; + for (Function::const_arg_iterator i = F->arg_begin(), + e = F->arg_end(); + i != e; ++i) + parameters.push_back(i->getType()); + + for (Module::global_iterator i = M->global_begin(), + e = M->global_end(); + i != e; ++i) { + std::string funcName = ""; + funcName = F->getName().str(); + if (i->getName().startswith(funcName + ".")) { + // Additional checks might be needed here. For now + // we assume any global starting with kernel name + // is declaring a local variable. + locals.push_back(i); + // Add the parameters to the end of the function parameter list. + parameters.push_back(i->getType()); + } + } + + out << "#define _" << F->getName() << "_NUM_LOCALS "<< locals.size() << "\n"; + out << "#define _" << F->getName() << "_LOCAL_SIZE {"; + if (!locals.empty()) { + out << TD.getTypeAllocSize(locals[0]->getInitializer()->getType()); + for (unsigned i = 1; i < locals.size(); ++i) + out << ", " << TD.getTypeAllocSize(locals[i]->getInitializer()->getType()); + } + out << "}\n"; + + if (locals.empty()) { + // This kernel fingerprint has not changed. + return F; + } + + // Create the new function. 
+ FunctionType *ft = FunctionType::get(F->getReturnType(), + parameters, + F->isVarArg()); + Function *new_kernel = Function::Create(ft, + F->getLinkage(), + "", + M); + new_kernel->takeName(F); + + ValueToValueMapTy vv; + Function::arg_iterator j = new_kernel->arg_begin(); + for (Function::const_arg_iterator i = F->arg_begin(), + e = F->arg_end(); + i != e; ++i) { + j->setName(i->getName()); + vv[i] = j; + ++j; + } + + for (int i = 0; j != new_kernel->arg_end(); ++i, ++j) { + j->setName("_local" + Twine(i)); + vv[locals[i]] = j; + } + + SmallVector<ReturnInst *, 1> ri; + CloneFunctionInto(new_kernel, F, vv, false, ri); + + return new_kernel; +} + diff --git a/src/llvmopencl/ImplicitLoopBarriers.cc b/src/llvmopencl/ImplicitLoopBarriers.cc new file mode 100644 index 0000000..66dcdb3 --- /dev/null +++ b/src/llvmopencl/ImplicitLoopBarriers.cc @@ -0,0 +1,178 @@ +// LLVM function pass that adds implicit barriers to loops if it sees +// beneficial. +// +// Copyright (c) 2012-2014 Pekka Jääskeläinen / TUT +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "config.h" +#include "ImplicitLoopBarriers.h" +#include "Barrier.h" +#include "Workgroup.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#if (defined LLVM_3_1 or defined LLVM_3_2) +#include "llvm/Constants.h" +#include "llvm/Instructions.h" +#include "llvm/Module.h" +#else +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#endif + +#include "VariableUniformityAnalysis.h" + +#include <iostream> + +//#define DEBUG_ILOOP_BARRIERS + +using namespace llvm; +using namespace pocl; + +namespace { + static + RegisterPass<ImplicitLoopBarriers> X("implicit-loop-barriers", + "Adds implicit barriers to loops"); +} + +char ImplicitLoopBarriers::ID = 0; + +void +ImplicitLoopBarriers::getAnalysisUsage(AnalysisUsage &AU) const +{ + AU.addRequired<DominatorTree>(); + AU.addPreserved<DominatorTree>(); + AU.addRequired<VariableUniformityAnalysis>(); + AU.addPreserved<VariableUniformityAnalysis>(); +} + +bool +ImplicitLoopBarriers::runOnLoop(Loop *L, LPPassManager &LPM) +{ + if (!Workgroup::isKernelToProcess(*L->getHeader()->getParent())) + return false; + + return ProcessLoop(L, LPM); +} + + +/** + * Adds a barrier to the first BB of each loop. + * + * Note: it's not safe to do this in case the loop is not executed + * by all work items. Therefore this is not enabled by default. 
+ */ +bool +ImplicitLoopBarriers::ProcessLoop(Loop *L, LPPassManager &LPM) +{ + + bool isBLoop = false; + for (Loop::block_iterator i = L->block_begin(), e = L->block_end(); + i != e && !isBLoop; ++i) { + for (BasicBlock::iterator j = (*i)->begin(), e = (*i)->end(); + j != e; ++j) { + if (isa<Barrier>(j)) { + isBLoop = true; + break; + } + } + } + if (isBLoop) return false; + + return AddInnerLoopBarrier(L, LPM); +} + +/** + * Adds a barrier to the beginning of the loop body to force its treatment + * similarly to a loop with work-group barriers. + * + * This allows parallelizing work-items across the work-group per kernel + * for-loop iteration, potentially leading to easier horizontal vectorization. + * The idea is similar to loop switching where the work-item loop is + * switched with the kernel for-loop. + * + * We need to make sure it is legal to add the barrier, though. The + * OpenCL barrier semantics require either all or none of the WIs to + * reach the barrier at each iteration. This is satisfied at least when + * + * a) loop exit condition does not depend on the WI and + * b) all or none of the WIs always enter the loop + */ +bool +ImplicitLoopBarriers::AddInnerLoopBarrier(llvm::Loop *L, llvm::LPPassManager &LPM) { + + /* Only add barriers to the innermost loops. */ + + if (L->getSubLoops().size() > 0) + return false; + +#ifdef DEBUG_ILOOP_BARRIERS + std::cerr << "### trying to add a loop barrier to force horizontal parallelization" + << std::endl; +#endif + + BasicBlock *brexit = L->getExitingBlock(); + if (brexit == NULL) return false; /* Multiple exit points */ + + llvm::BasicBlock *loopEntry = L->getHeader(); + if (loopEntry == NULL) return false; /* Multiple entries blocks? */ + + llvm::Function *f = brexit->getParent(); + + VariableUniformityAnalysis &VUA = + getAnalysis<VariableUniformityAnalysis>(); + + /* Check if the whole loop construct is executed by all or none of the + work-items. 
*/ + if (!VUA.isUniform(f, loopEntry)) { +#ifdef DEBUG_ILOOP_BARRIERS + std::cerr << "### the loop is not uniform because loop entry '" + << loopEntry->getName().str() << "' is not uniform" << std::endl; + +#endif + return false; + } + + /* Check the branch condition predicate. If it is uniform, we know the loop + is executed the same number of times for all WIs. */ + llvm::BranchInst *br = dyn_cast<llvm::BranchInst>(brexit->getTerminator()); + if (br && br->isConditional() && + VUA.isUniform(f, br->getCondition())) { + + Barrier::Create(brexit->getTerminator()); +#ifdef DEBUG_ILOOP_BARRIERS + std::cerr << "### added an inner-loop barrier to the loop" << std::endl << std::endl; +#endif + return true; + } else { +#ifdef DEBUG_ILOOP_BARRIERS + if (br && br->isConditional() && !VUA.isUniform(f, br->getCondition())) { + std::cerr << "### loop condition not uniform" << std::endl; + br->getCondition()->dump(); + } +#endif + + } + +#ifdef DEBUG_ILOOP_BARRIERS + std::cerr << "### cannot add an inner-loop barrier to the loop" << std::endl << std::endl; +#endif + + return false; +} diff --git a/src/llvmopencl/ImplicitLoopBarriers.h b/src/llvmopencl/ImplicitLoopBarriers.h new file mode 100644 index 0000000..e31a134 --- /dev/null +++ b/src/llvmopencl/ImplicitLoopBarriers.h @@ -0,0 +1,44 @@ +// Header for ImplicitLoopBarriers loop pass. 
+// +// Copyright (c) 2012-2014 Pekka Jääskeläinen / TUT +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "llvm/Analysis/LoopPass.h" +#include <set> + +namespace pocl { + class ImplicitLoopBarriers : public llvm::LoopPass { + + public: + static char ID; + + ImplicitLoopBarriers() : LoopPass(ID) {} + + virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const; + virtual bool runOnLoop(llvm::Loop *L, llvm::LPPassManager &LPM); + + private: + llvm::DominatorTree *DT; + + bool ProcessLoop(llvm::Loop *L, llvm::LPPassManager &LPM); + bool AddInnerLoopBarrier(llvm::Loop *L, llvm::LPPassManager &LPM); + + }; +} diff --git a/src/llvmopencl/IsolateRegions.cc b/src/llvmopencl/IsolateRegions.cc new file mode 100644 index 0000000..b370aa4 --- /dev/null +++ b/src/llvmopencl/IsolateRegions.cc @@ -0,0 +1,175 @@ +// Implementation of the IsolateRegions RegionPass.
+// +// Copyright (c) 2012 Pekka Jääskeläinen / TUT +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "IsolateRegions.h" +#include "Barrier.h" +#include "Workgroup.h" +#include "llvm/Analysis/RegionInfo.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "config.h" + +#include <iostream> + +//#define DEBUG_ISOLATE_REGIONS +using namespace llvm; +using namespace pocl; + +namespace { + static + RegisterPass<IsolateRegions> X("isolate-regions", + "Single-Entry Single-Exit region isolation pass."); +} + +char IsolateRegions::ID = 0; + +void +IsolateRegions::getAnalysisUsage(AnalysisUsage &AU) const +{ +} + +/* Ensure Single-Entry Single-Exit Regions are isolated from the + exit node so they won't get split illegally with tail replication. + + This might happen in case an if .. else .. structure is just + before an exit from kernel. 
Both branches are split even though + we would like to replicate the structure as a whole to retain + semantics. This adds dummy basic blocks to all Regions just for + clarity. Cleanup with -simplifycfg. + + TODO: Also add a dummy BB in case the Region starts with a + barrier. Such a Region might not get optimally replicated and + can lead to problematic cases. E.g.: + + digraph G { + BAR1 -> A; + A -> X; + BAR1 -> X; + X -> BAR2; + } + + (draw with "dot -Tpng -o graph.png" + copy paste the above) + + Here you have a structure which should be replicated fully but + it won't as the Region starts with a barrier at a split point + BB, thus it tries to replicate both of the branches which lead + to interesting errors and is not supported. Another option would + be to tail replicate both of the branches, but currently tail + replication is done only starting from the exit nodes. + + IsolateRegions "normalizes" the graph to: + + digraph G { + BAR1 -> r_entry; + r_entry -> A; + A -> X; + r_entry -> X; + X -> BAR2; + } + + +*/ +bool +IsolateRegions::runOnRegion(Region *R, llvm::RGPassManager&) +{ + llvm::BasicBlock *exit = R->getExit(); + if (exit == NULL) return false; + +#ifdef DEBUG_ISOLATE_REGIONS + std::cerr << "### processing region:" << std::endl; + R->dump(); + std::cerr << "### exit block:" << std::endl; + exit->dump(); +#endif + bool isFunctionExit = exit->getTerminator()->getNumSuccessors() == 0; + + bool changed = false; + + if (Barrier::hasBarrier(exit) || isFunctionExit) + { + addDummyBefore(R, exit); + changed = true; + } + + llvm::BasicBlock *entry = R->getEntry(); + if (entry == NULL) return changed; + + bool isFunctionEntry = &entry->getParent()->getEntryBlock() == entry; + + if (Barrier::hasBarrier(entry) || isFunctionEntry) + { + addDummyAfter(R, entry); + changed = true; + } + + return changed; +} + + +/** + * Adds a dummy node after the given basic block. 
+ */ +void +IsolateRegions::addDummyAfter(llvm::Region *R, llvm::BasicBlock *bb) +{ + std::vector< llvm::BasicBlock* > regionSuccs; + + for (llvm::succ_iterator i = succ_begin(bb), e = succ_end(bb); + i != e; ++i) { + llvm::BasicBlock* succ = *i; + if (R->contains(succ)) + regionSuccs.push_back(succ); + } + llvm::BasicBlock* newEntry = + SplitBlock(bb, bb->getTerminator(), this); + newEntry->setName(bb->getName() + ".r_entry"); + R->replaceEntry(newEntry); + +} + +/** + * Adds a dummy node before the given basic block. + * + * The edges going in to the original BB are moved to go + * in to the dummy BB in case the source BB is inside the + * same region. + */ +void +IsolateRegions::addDummyBefore(llvm::Region *R, llvm::BasicBlock *bb) +{ + std::vector< llvm::BasicBlock* > regionPreds; + + for (pred_iterator i = pred_begin(bb), e = pred_end(bb); + i != e; ++i) { + llvm::BasicBlock* pred = *i; + if (R->contains(pred)) + regionPreds.push_back(pred); + } +#ifdef LLVM_3_0 + llvm::BasicBlock* newExit = + SplitBlockPredecessors + (bb, ®ionPreds[0], regionPreds.size(), ".r_exit", this); +#else + llvm::BasicBlock* newExit = + SplitBlockPredecessors(bb, regionPreds, ".r_exit", this); +#endif + R->replaceExit(newExit); +} diff --git a/src/llvmopencl/IsolateRegions.h b/src/llvmopencl/IsolateRegions.h new file mode 100644 index 0000000..62f6a29 --- /dev/null +++ b/src/llvmopencl/IsolateRegions.h @@ -0,0 +1,44 @@ +// Header for IsolateRegions RegionPass. 
+// +// Copyright (c) 2012 Pekka Jääskeläinen / TUT +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef POCL_ISOLATE_REGIONS_H +#define POCL_ISOLATE_REGIONS_H + +#include "llvm/Analysis/RegionPass.h" + +namespace pocl { + + class IsolateRegions : public llvm::RegionPass { + public: + static char ID; + + IsolateRegions() : RegionPass(ID) {} + + virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const; + virtual bool runOnRegion(llvm::Region *R, llvm::RGPassManager&); + void addDummyAfter(llvm::Region *R, llvm::BasicBlock *bb); + void addDummyBefore(llvm::Region *R, llvm::BasicBlock *bb); + + }; +} + +#endif diff --git a/src/llvmopencl/Kernel.cc b/src/llvmopencl/Kernel.cc new file mode 100644 index 0000000..03e08b8 --- /dev/null +++ b/src/llvmopencl/Kernel.cc @@ -0,0 +1,297 @@ +// Class for kernels, llvm::Functions that represent OpenCL C kernels. 
+// +// Copyright (c) 2011 Universidad Rey Juan Carlos and +// 2012 Pekka Jääskeläinen / TUT +// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "Kernel.h" +#include "Barrier.h" +#include <iostream> + +#include "config.h" +#ifdef LLVM_3_1 +#include "llvm/Support/IRBuilder.h" +#elif defined LLVM_3_2 +#include "llvm/IRBuilder.h" +#else +#include "llvm/IR/IRBuilder.h" +#endif + +//#define DEBUG_PR_CREATION + +using namespace llvm; +using namespace pocl; + +static void add_predecessors(SmallVectorImpl<BasicBlock *> &v, + BasicBlock *b); +static bool verify_no_barriers(const BasicBlock *B); + +void +Kernel::getExitBlocks(SmallVectorImpl<BarrierBlock *> &B) +{ + for (iterator i = begin(), e = end(); i != e; ++i) { + const TerminatorInst *t = i->getTerminator(); + if (t->getNumSuccessors() == 0) { + // All exits must be barrier blocks. 
+ B.push_back(cast<BarrierBlock>(i)); + } + } +} + +ParallelRegion * +Kernel::createParallelRegionBefore(BarrierBlock *B) +{ + SmallVector<BasicBlock *, 4> pending_blocks; + SmallPtrSet<BasicBlock *, 8> blocks_in_region; + BarrierBlock *region_entry_barrier = NULL; + llvm::BasicBlock *entry = NULL; + llvm::BasicBlock *exit = B->getSinglePredecessor(); + add_predecessors(pending_blocks, B); + +#ifdef DEBUG_PR_CREATION + std::cerr << "createParallelRegionBefore " << B->getName().str() << std::endl; +#endif + + while (!pending_blocks.empty()) { + BasicBlock *current = pending_blocks.back(); + pending_blocks.pop_back(); + +#ifdef DEBUG_PR_CREATION + std::cerr << "considering " << current->getName().str() << std::endl; +#endif + + // avoid infinite recursion of loops + if (blocks_in_region.count(current) != 0) + { +#ifdef DEBUG_PR_CREATION + std::cerr << "already in the region!" << std::endl; +#endif + continue; + } + + // If we reach another barrier this must be the + // parallel region entry. + if (isa<BarrierBlock>(current)) { + if (region_entry_barrier == NULL) + region_entry_barrier = cast<BarrierBlock>(current); +#ifdef DEBUG_PR_CREATION + std::cerr << "### it's a barrier!" << std::endl; +#endif + continue; + } + + + if (!verify_no_barriers(current)) + { + assert(verify_no_barriers(current) && + "Barrier found in a non-barrier block! (forgot barrier canonicalization?)"); + } + +#ifdef DEBUG_PR_CREATION + std::cerr << "added it to the region" << std::endl; +#endif + // Non-barrier block, this must be on the region. + blocks_in_region.insert(current); + + // Add predecessors to pending queue. + add_predecessors(pending_blocks, current); + } + + if (blocks_in_region.empty()) + return NULL; + + // Find the entry node. 
+ assert (region_entry_barrier != NULL); + for (unsigned suc = 0, num = region_entry_barrier->getTerminator()->getNumSuccessors(); + suc < num; ++suc) + { + llvm::BasicBlock *entryCandidate = + region_entry_barrier->getTerminator()->getSuccessor(suc); + if (blocks_in_region.count(entryCandidate) == 0) + continue; + entry = entryCandidate; + break; + } + assert (blocks_in_region.count(entry) != 0); + + // We got all the blocks in a region, create it. + return ParallelRegion::Create(blocks_in_region, entry, exit); +} + +static void +add_predecessors(SmallVectorImpl<BasicBlock *> &v, BasicBlock *b) +{ + for (pred_iterator i = pred_begin(b), e = pred_end(b); + i != e; ++i) { + if ((isa<BarrierBlock> (*i)) && isa<BarrierBlock> (b)) { + // Ignore barrier-to-barrier edges * Why? --Pekka + add_predecessors(v, *i); + continue; + } + v.push_back(*i); + } +} + +static bool +verify_no_barriers(const BasicBlock *B) +{ + for (BasicBlock::const_iterator i = B->begin(), e = B->end(); i != e; ++i) { + if (isa<Barrier>(i)) + return false; + } + + return true; +} + +ParallelRegion::ParallelRegionVector * +Kernel::getParallelRegions(llvm::LoopInfo *LI) { + ParallelRegion::ParallelRegionVector *parallel_regions = + new ParallelRegion::ParallelRegionVector; + + SmallVector<BarrierBlock *, 4> exit_blocks; + getExitBlocks(exit_blocks); + + // We need to keep track of traversed barriers to detect back edges. + SmallPtrSet<BarrierBlock *, 8> found_barriers; + + // First find all the ParallelRegions in the Function. + while (!exit_blocks.empty()) { + + // We start on an exit block and process the parallel regions upwards + // (finding an execution trace). 
+ BarrierBlock *exit = exit_blocks.back(); + exit_blocks.pop_back(); + + while (ParallelRegion *PR = createParallelRegionBefore(exit)) { + assert(PR != NULL && !PR->empty() && + "Empty parallel region in kernel (contiguous barriers)!"); + + found_barriers.insert(exit); + exit = NULL; + parallel_regions->push_back(PR); + BasicBlock *entry = PR->entryBB(); + int found_predecessors = 0; + BarrierBlock *loop_barrier = NULL; + for (pred_iterator i = pred_begin(entry), e = pred_end(entry); + i != e; ++i) { + BarrierBlock *barrier = cast<BarrierBlock> (*i); + if (!found_barriers.count(barrier)) { + /* If this is a loop header block we might have edges from two + unprocessed barriers. The one inside the loop (coming from a + computation block after a branch block) should be processed + first. */ + std::string bbName = ""; + const bool IS_IN_THE_SAME_LOOP = + LI->getLoopFor(barrier) != NULL && + LI->getLoopFor(entry) != NULL && + LI->getLoopFor(entry) == LI->getLoopFor(barrier); + + if (IS_IN_THE_SAME_LOOP) + { +#ifdef DEBUG_PR_CREATION + std::cout << "### found a barrier inside the loop:" << std::endl; + std::cout << barrier->getName().str() << std::endl; +#endif + if (loop_barrier != NULL) { + // there can be multiple latches and each have their barrier, + // save the previously found inner loop barrier + exit_blocks.push_back(loop_barrier); + } + loop_barrier = barrier; + } + else + { +#ifdef DEBUG_PR_CREATION + std::cout << "### found a barrier:" << std::endl; + std::cout << barrier->getName().str() << std::endl; +#endif + exit = barrier; + } + ++found_predecessors; + } + } + + if (loop_barrier != NULL) + { + /* The secondary barrier to process in case it was a loop + header. Push it for later processing. 
*/ + if (exit != NULL) + exit_blocks.push_back(exit); + /* always process the inner loop regions first */ + if (!found_barriers.count(loop_barrier)) + exit = loop_barrier; + } + +#ifdef DEBUG_PR_CREATION + std::cout << "### created a ParallelRegion:" << std::endl; + PR->dumpNames(); + std::cout << std::endl; +#endif + + if (found_predecessors == 0) + { + /* This path has been traversed and we encountered no more + unprocessed regions. It means we have either traversed all + paths from the exit or have transformed a loop and thus + encountered only a barrier that was seen (and thus + processed) before. */ + break; + } + assert ((exit != NULL) && "Parallel region without entry barrier!"); + } + } + return parallel_regions; + +} + +void +Kernel::addLocalSizeInitCode(size_t LocalSizeX, size_t LocalSizeY, size_t LocalSizeZ) { + + IRBuilder<> builder(getEntryBlock().getFirstNonPHI()); + + GlobalVariable *gv; + + llvm::Module* M = getParent(); + + int size_t_width = 32; + if (M->getPointerSize() == llvm::Module::Pointer64) + size_t_width = 64; + + FunctionType *ft = FunctionType::get + (/*Result=*/ IntegerType::get(M->getContext(), 32), + /*Params=*/ IntegerType::get(M->getContext(), 32), + /*isVarArg=*/ false); + Function *localsize = + dyn_cast<Function>(M->getOrInsertFunction("get_local_size", ft)); + gv = M->getGlobalVariable("_local_size_x"); + builder.CreateStore(builder.CreateCall(localsize, + ConstantInt::get(IntegerType::get(M->getContext(), size_t_width), 0)), + gv); + gv = M->getGlobalVariable("_local_size_y"); + builder.CreateStore(builder.CreateCall(localsize, + ConstantInt::get(IntegerType::get(M->getContext(), size_t_width), 1)), + gv); + gv = M->getGlobalVariable("_local_size_z"); + builder.CreateStore(builder.CreateCall(localsize, + ConstantInt::get(IntegerType::get(M->getContext(), size_t_width), 2)), + gv); +} + diff --git a/src/llvmopencl/Kernel.h b/src/llvmopencl/Kernel.h new file mode 100644 index 0000000..5337b54 --- /dev/null +++ 
b/src/llvmopencl/Kernel.h @@ -0,0 +1,54 @@ +// Class for kernels, a special kind of function. +// +// Copyright (c) 2011 Universidad Rey Juan Carlos +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef _POCL_KERNEL_H +#define _POCL_KERNEL_H + +#include "ParallelRegion.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/LoopInfo.h" + +namespace pocl { + + class Kernel : public llvm::Function { + public: + void getExitBlocks(llvm::SmallVectorImpl<BarrierBlock *> &B); + ParallelRegion *createParallelRegionBefore(BarrierBlock *B); + + ParallelRegion::ParallelRegionVector* + getParallelRegions(llvm::LoopInfo *LI); + + void addLocalSizeInitCode(size_t LocalSizeX, size_t LocalSizeY, size_t LocalSizeZ); + + static bool isKernel(const llvm::Function &F); + + static bool classof(const Kernel *) { return true; } + // We assume any function can be a kernel. 
This could be used + // to check for metadata (but would need to be overrideable somehow + // to honor the forced kernel name(s) parameter in command line. + static bool classof(const llvm::Function *) { return true; } + }; + +} + +#endif diff --git a/src/llvmopencl/LLVMUtils.cc b/src/llvmopencl/LLVMUtils.cc new file mode 100644 index 0000000..aeb02d7 --- /dev/null +++ b/src/llvmopencl/LLVMUtils.cc @@ -0,0 +1,90 @@ +// Implementation of LLVMUtils, useful common LLVM-related functionality. +// +// Copyright (c) 2013 Pekka Jääskeläinen / TUT +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#include "LLVMUtils.h" + +#include "config.h" + +#ifdef LLVM_3_2 +#include <llvm/Module.h> +#include <llvm/Metadata.h> +#else +#include <llvm/IR/Module.h> +#include <llvm/IR/Metadata.h> +#endif + +using namespace llvm; + +/** + * Regenerates the metadata that points to the original kernel + * (of which finger print was modified) to point to the new + * kernel. + * + * Only checks if the first operand of the metadata is the kernel + * function. + */ +void +regenerate_kernel_metadata(llvm::Module &M, FunctionMapping &kernels) +{ + // reproduce the opencl.kernel_wg_size_info metadata + NamedMDNode *wg_sizes = M.getNamedMetadata("opencl.kernel_wg_size_info"); + if (wg_sizes != NULL && wg_sizes->getNumOperands() > 0) + { + for (std::size_t mni = 0; mni < wg_sizes->getNumOperands(); ++mni) + { + MDNode *wgsizeMD = dyn_cast<MDNode>(wg_sizes->getOperand(mni)); + for (FunctionMapping::const_iterator i = kernels.begin(), + e = kernels.end(); i != e; ++i) + { + Function *old_kernel = (*i).first; + Function *new_kernel = (*i).second; + if (old_kernel == new_kernel || wgsizeMD->getNumOperands() == 0 || + dyn_cast<Function>(wgsizeMD->getOperand(0)) != old_kernel) + continue; + // found a wg size metadata that points to the old kernel, copy its + // operands except the first one to a new MDNode + SmallVector<Value*, 8> operands; + operands.push_back(new_kernel); + for (unsigned opr = 1; opr < wgsizeMD->getNumOperands(); ++opr) + { + operands.push_back(wgsizeMD->getOperand(opr)); + } + MDNode *new_wg_md = MDNode::get(M.getContext(), operands); + wg_sizes->addOperand(new_wg_md); + } + } + } + + // reproduce the opencl.kernels metadata + NamedMDNode *nmd = M.getNamedMetadata("opencl.kernels"); + if (nmd) + M.eraseNamedMetadata(nmd); + + nmd = M.getOrInsertNamedMetadata("opencl.kernels"); + for (FunctionMapping::const_iterator i = kernels.begin(), + e = kernels.end(); + i != e; ++i) { + MDNode *md = MDNode::get(M.getContext(), ArrayRef<Value *>((*i).second)); + 
nmd->addOperand(md); + } +} + diff --git a/src/llvmopencl/LLVMUtils.h b/src/llvmopencl/LLVMUtils.h new file mode 100644 index 0000000..e6a89db --- /dev/null +++ b/src/llvmopencl/LLVMUtils.h @@ -0,0 +1,38 @@ +// Header for LLVMUtils, useful common LLVM-related functionality. +// +// Copyright (c) 2013 Pekka Jääskeläinen / TUT +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef _POCL_LLVM_UTILS_H +#define _POCL_LLVM_UTILS_H + +#include <map> + +namespace llvm { + class Module; + class Function; +} + +typedef std::map<llvm::Function*, llvm::Function*> FunctionMapping; + +void +regenerate_kernel_metadata(llvm::Module &M, FunctionMapping &kernels); + +#endif diff --git a/src/llvmopencl/LoopBarriers.cc b/src/llvmopencl/LoopBarriers.cc new file mode 100644 index 0000000..5e4965f --- /dev/null +++ b/src/llvmopencl/LoopBarriers.cc @@ -0,0 +1,194 @@ +// LLVM loop pass that adds required barriers to loops. 
+// +// Copyright (c) 2011 Universidad Rey Juan Carlos +// 2012-2014 Pekka Jääskeläinen / Tampere University of Technology +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#include "config.h" +#if (defined LLVM_3_1 or defined LLVM_3_2) +#include "llvm/Constants.h" +#include "llvm/Instructions.h" +#include "llvm/Module.h" +#else +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#endif +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include <iostream> + +#include "LoopBarriers.h" +#include "Barrier.h" +#include "Workgroup.h" + +//#define DEBUG_LOOP_BARRIERS + +using namespace llvm; +using namespace pocl; + +namespace { + static + RegisterPass<LoopBarriers> X("loop-barriers", + "Add needed barriers to loops"); +} + +char LoopBarriers::ID = 0; + +void +LoopBarriers::getAnalysisUsage(AnalysisUsage &AU) const +{ + AU.addRequired<DominatorTree>(); + AU.addPreserved<DominatorTree>(); +} + +bool +LoopBarriers::runOnLoop(Loop *L, LPPassManager &LPM) +{ + if (!Workgroup::isKernelToProcess(*L->getHeader()->getParent())) + return false; + + DT = &getAnalysis<DominatorTree>(); + + bool changed = ProcessLoop(L, LPM); + + DT->verifyAnalysis(); + + return changed; +} + + +bool +LoopBarriers::ProcessLoop(Loop *L, LPPassManager &LPM) +{ + bool isBLoop = false; + bool changed = false; + + for (Loop::block_iterator i = L->block_begin(), e = L->block_end(); + i != e && !isBLoop; ++i) { + for (BasicBlock::iterator j = (*i)->begin(), e = (*i)->end(); + j != e; ++j) { + if (isa<Barrier>(j)) { + isBLoop = true; + break; + } + } + } + + for (Loop::block_iterator i = L->block_begin(), e = L->block_end(); + i != e && isBLoop; ++i) { + for (BasicBlock::iterator j = (*i)->begin(), e = (*i)->end(); + j != e; ++j) { + if (isa<Barrier>(j)) { + + // Found a barrier in this loop: + // 1) add a barrier in the loop header. + // 2) add a barrier in the latches + + // Add a barrier on the preheader to ensure all WIs reach + // the loop header with all the previous code already + // executed.
+ BasicBlock *preheader = L->getLoopPreheader(); + assert((preheader != NULL) && "Non-canonicalized loop found!\n"); +#ifdef DEBUG_LOOP_BARRIERS + std::cerr << "### adding to preheader BB" << std::endl; + preheader->dump(); + std::cerr << "### before instr" << std::endl; + preheader->getTerminator()->dump(); +#endif + Barrier::Create(preheader->getTerminator()); + preheader->setName(preheader->getName() + ".loopbarrier"); + + // Add a barrier after the PHI nodes on the header (the replicated + // headers will be merged afterwards). + BasicBlock *header = L->getHeader(); + if (header->getFirstNonPHI() != &header->front()) { + Barrier::Create(header->getFirstNonPHI()); + header->setName(header->getName() + ".phibarrier"); + // Split the block to create a replicable region of + // the loop contents in case the phi node contains a + // branch (that can be to inside the region). + // if (header->getTerminator()->getNumSuccessors() > 1) + // SplitBlock(header, header->getTerminator(), this); + } + + // Add the barriers on the exiting block and the latches, + // which might not always be the same if there is computation + // after the exit decision. + BasicBlock *brexit = L->getExitingBlock(); + if (brexit != NULL) { + Barrier::Create(brexit->getTerminator()); + brexit->setName(brexit->getName() + ".brexitbarrier"); + } + + BasicBlock *latch = L->getLoopLatch(); + if (latch != NULL && brexit != latch) { + // This loop has only one latch. Do not check for dominance, we + // are probably running before BTR. + Barrier::Create(latch->getTerminator()); + latch->setName(latch->getName() + ".latchbarrier"); + return true; + } + + // Modified code from llvm::LoopBase::getLoopLatch to + // go through all the latches.
+ BasicBlock *Header = L->getHeader(); + typedef GraphTraits<Inverse<BasicBlock *> > InvBlockTraits; + InvBlockTraits::ChildIteratorType PI = InvBlockTraits::child_begin(Header); + InvBlockTraits::ChildIteratorType PE = InvBlockTraits::child_end(Header); + BasicBlock *Latch = NULL; + for (; PI != PE; ++PI) { + InvBlockTraits::NodeType *N = *PI; + if (L->contains(N)) { + Latch = N; + // Latch found in the loop, see if the barrier dominates it + // (otherwise it might not even belong to this "tail", see + // forifbarrier1 graph test). + if (DT->dominates(j->getParent(), Latch)) { + Barrier::Create(Latch->getTerminator()); + Latch->setName(Latch->getName() + ".latchbarrier"); + } + } + } + return true; + } + } + } + + /* This is a loop without a barrier. Ensure we have a non-barrier + block as a preheader so we can replicate the loop as a whole. + + If the block has proper instructions after the barrier, it + will be split in CanonicalizeBarriers. */ + BasicBlock *preheader = L->getLoopPreheader(); + assert((preheader != NULL) && "Non-canonicalized loop found!\n"); + TerminatorInst *t = preheader->getTerminator(); + Instruction *prev = NULL; + if (&preheader->front() != t) + prev = t->getPrevNode(); + if (prev && isa<Barrier>(prev)) + { + BasicBlock *new_b = SplitBlock(preheader, t, this); + new_b->setName(preheader->getName() + ".postbarrier_dummy"); + return true; + } + + return changed; +} + diff --git a/src/llvmopencl/LoopBarriers.h b/src/llvmopencl/LoopBarriers.h new file mode 100644 index 0000000..6d80de6 --- /dev/null +++ b/src/llvmopencl/LoopBarriers.h @@ -0,0 +1,47 @@ +// Header for LoopBarriers.cc function pass.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef POCL_LOOP_BARRIERS_H
+#define POCL_LOOP_BARRIERS_H
+
+#include "llvm/Analysis/LoopPass.h"
+#include <set>
+
+namespace pocl {
+  // Loop pass that adds the barriers needed by loops containing a barrier
+  // (registered in LoopBarriers.cc as "loop-barriers").
+  class LoopBarriers : public llvm::LoopPass {
+
+  public:
+    // Pass identifier handed to the LoopPass base-class constructor.
+    static char ID;
+
+    // NOTE(review): DT is left uninitialized here; it is assigned in
+    // runOnLoop() before ProcessLoop() reads it.
+    LoopBarriers() : LoopPass(ID) {}
+
+    // Declares the analyses this pass requires and preserves.
+    virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const;
+    // Per-loop entry point; returns true if the IR was modified.
+    virtual bool runOnLoop(llvm::Loop *L, llvm::LPPassManager &LPM);
+
+  private:
+    // Dominator tree of the function being processed; set by runOnLoop().
+    llvm::DominatorTree *DT;
+
+    // Does the actual barrier insertion for loop L; returns true if the IR
+    // was modified.
+    bool ProcessLoop(llvm::Loop *L, llvm::LPPassManager &LPM);
+  };
+}
+
+#endif
diff --git a/src/llvmopencl/Makefile.am b/src/llvmopencl/Makefile.am
new file mode 100644
index 0000000..881a35c
--- /dev/null
+++ b/src/llvmopencl/Makefile.am
@@ -0,0 +1,53 @@
+# Process this file with automake to produce Makefile.in (in this,
+# and all subdirectories).
+# Makefile.am for pocl/lib/llvmopencl.
+# +# Copyright (c) 2011 Universidad Rey Juan Carlos +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+
+# The LLVM pass collection is built as one libtool library that is installed
+# into $(pkglibdir).
+pkglib_LTLIBRARIES = llvmopencl.la
+
+# Compiler flags: project include directories plus whatever llvm-config
+# reports for C++ compilation.
+# NOTE(review): the -I options are preprocessor flags; automake convention
+# would place them in AM_CPPFLAGS - confirm before moving them.
+AM_CXXFLAGS = -I@top_srcdir@/fix-include -I@top_srcdir@/include `@LLVM_CONFIG@ --cxxflags`
+# Linker flags: libtool -module / -export-dynamic / -version-info options
+# (presumably so the library can be loaded as a plugin - confirm against the
+# libtool manual) plus whatever llvm-config reports for linking.
+AM_LDFLAGS = -module -export-dynamic -version-info ${KERNEL_COMPILER_LIB_VERSION} `@LLVM_CONFIG@ --ldflags`
+# Link against the LLVM library of the configured version.
+llvmopencl_la_LIBADD = -lLLVM-${LLVM_VERSION}
+
+# Headers and sources that make up llvmopencl.la.
+llvmopencl_la_SOURCES = Barrier.h \
+ BarrierBlock.h BarrierBlock.cc \
+ Kernel.h Kernel.cc \
+ ParallelRegion.h ParallelRegion.cc \
+ CanonicalizeBarriers.h CanonicalizeBarriers.cc \
+ LoopBarriers.h LoopBarriers.cc \
+ GenerateHeader.cc Workgroup.h Workgroup.cc \
+ BarrierTailReplication.h BarrierTailReplication.cc \
+ Flatten.cc IsolateRegions.h IsolateRegions.cc \
+ WorkitemReplication.h WorkitemReplication.cc \
+ ImplicitLoopBarriers.h ImplicitLoopBarriers.cc \
+ WorkItemAliasAnalysis.cc WIVectorize.cc \
+ WorkitemHandler.h WorkitemHandler.cc \
+ WorkitemLoops.h WorkitemLoops.cc \
+ PHIsToAllocas.h PHIsToAllocas.cc \
+ BreakConstantGEPs.h BreakConstantGEPs.cpp \
+ WorkitemHandlerChooser.h WorkitemHandlerChooser.cc \
+ AllocasToEntry.h AllocasToEntry.cc \
+ TargetAddressSpaces.h TargetAddressSpaces.cc \
+ LLVMUtils.cc LLVMUtils.h \
+ VariableUniformityAnalysis.h VariableUniformityAnalysis.cc
+
+# Disabled: extra LIBADD entry for the LLVM TransformUtils libraries.
+#llvmopencl_la_LIBADD += @LIBS_LLVMTRANSFORMUTILS@
diff --git a/src/llvmopencl/Makefile.in b/src/llvmopencl/Makefile.in
new file mode 100644
index 0000000..e4dd24b
--- /dev/null
+++ b/src/llvmopencl/Makefile.in
@@ -0,0 +1,822 @@
+# Makefile.in generated by automake 1.14 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994-2014 Free Software Foundation, Inc.
+
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+ +@SET_MAKE@ + +# Process this file with automake to produce Makefile.in (in this, +# and all subdirectories). +# Makefile.am for pocl/lib/llvmopencl. +# +# Copyright (c) 2011 Universidad Rey Juan Carlos +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +VPATH = @srcdir@ +am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)' +am__make_running_with_option = \ + case $${target_option-} in \ + ?) 
;; \ + *) echo "am__make_running_with_option: internal error: invalid" \ + "target option '$${target_option-}' specified" >&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + -*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +target_triplet = @target@ +subdir = lib/llvmopencl +DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \ + $(top_srcdir)/config/depcomp +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = 
$(top_srcdir)/m4/ax_boost_base.m4 \ + $(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \ + $(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \ + $(top_srcdir)/m4/lt~obsolete.m4 $(top_srcdir)/acinclude.m4 \ + $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; +am__vpath_adj = case $$p in \ + $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ + *) f=$$p;; \ + esac; +am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; +am__install_max = 40 +am__nobase_strip_setup = \ + srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` +am__nobase_strip = \ + for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" +am__nobase_list = $(am__nobase_strip_setup); \ + for p in $$list; do echo "$$p $$p"; done | \ + sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ + $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ + if (++n[$$2] == $(am__install_max)) \ + { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ + END { for (dir in files) print dir, files[dir] }' +am__base_list = \ + sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ + sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' +am__uninstall_files_from_dir = { \ + test -z "$$files" \ + || { test ! -d "$$dir" && test ! -f "$$dir" && test ! 
-r "$$dir"; } \ + || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ + $(am__cd) "$$dir" && rm -f $$files; }; \ + } +am__installdirs = "$(DESTDIR)$(pkglibdir)" +LTLIBRARIES = $(pkglib_LTLIBRARIES) +llvmopencl_la_DEPENDENCIES = +am_llvmopencl_la_OBJECTS = BarrierBlock.lo Kernel.lo ParallelRegion.lo \ + CanonicalizeBarriers.lo LoopBarriers.lo GenerateHeader.lo \ + Workgroup.lo BarrierTailReplication.lo Flatten.lo \ + IsolateRegions.lo WorkitemReplication.lo \ + ImplicitLoopBarriers.lo WorkItemAliasAnalysis.lo \ + WIVectorize.lo WorkitemHandler.lo WorkitemLoops.lo \ + PHIsToAllocas.lo BreakConstantGEPs.lo \ + WorkitemHandlerChooser.lo AllocasToEntry.lo \ + TargetAddressSpaces.lo LLVMUtils.lo \ + VariableUniformityAnalysis.lo +llvmopencl_la_OBJECTS = $(am_llvmopencl_la_OBJECTS) +AM_V_lt = $(am__v_lt_@AM_V@) +am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) +am__v_lt_0 = --silent +am__v_lt_1 = +AM_V_P = $(am__v_P_@AM_V@) +am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_@AM_V@) +am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_@AM_V@) +am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) +am__v_at_0 = @ +am__v_at_1 = +DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) +depcomp = $(SHELL) $(top_srcdir)/config/depcomp +am__depfiles_maybe = depfiles +am__mv = mv -f +CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) +LTCXXCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CXXFLAGS) $(CXXFLAGS) +AM_V_CXX = $(am__v_CXX_@AM_V@) +am__v_CXX_ = $(am__v_CXX_@AM_DEFAULT_V@) +am__v_CXX_0 = @echo " CXX " $@; +am__v_CXX_1 = +CXXLD = $(CXX) +CXXLINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \ + $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ 
+AM_V_CXXLD = $(am__v_CXXLD_@AM_V@) +am__v_CXXLD_ = $(am__v_CXXLD_@AM_DEFAULT_V@) +am__v_CXXLD_0 = @echo " CXXLD " $@; +am__v_CXXLD_1 = +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CFLAGS) $(CFLAGS) +AM_V_CC = $(am__v_CC_@AM_V@) +am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@) +am__v_CC_0 = @echo " CC " $@; +am__v_CC_1 = +CCLD = $(CC) +LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CCLD = $(am__v_CCLD_@AM_V@) +am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@) +am__v_CCLD_0 = @echo " CCLD " $@; +am__v_CCLD_1 = +SOURCES = $(llvmopencl_la_SOURCES) +DIST_SOURCES = $(llvmopencl_la_SOURCES) +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +# Read a list of newline-separated strings from the standard input, +# and print each of them once, without duplicates. Input order is +# *not* preserved. +am__uniquify_input = $(AWK) '\ + BEGIN { nonempty = 0; } \ + { items[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in items) print i; }; } \ +' +# Make sure the list of sources is unique. This is necessary because, +# e.g., the same source file might be shared among _SOURCES variables +# for different programs/libraries. 
+am__define_uniq_tagged_files = \ + list='$(am__tagged_files)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | $(am__uniquify_input)` +ETAGS = etags +CTAGS = ctags +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ +AMTAR = @AMTAR@ +AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ +AR = @AR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +BOOST_CPPFLAGS = @BOOST_CPPFLAGS@ +BOOST_LDFLAGS = @BOOST_LDFLAGS@ +CC = @CC@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CLANG = @CLANG@ +CLANGXX = @CLANGXX@ +CLFLAGS = @CLFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +DLLTOOL = @DLLTOOL@ +DSYMUTIL = @DSYMUTIL@ +DUMPBIN = @DUMPBIN@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +FGREP = @FGREP@ +GLEW_CFLAGS = @GLEW_CFLAGS@ +GLEW_LIBS = @GLEW_LIBS@ +GREP = @GREP@ +HOST = @HOST@ +HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@ +HOST_CPU = @HOST_CPU@ +HOST_LD_FLAGS = @HOST_LD_FLAGS@ +HOST_LLC_FLAGS = @HOST_LLC_FLAGS@ +HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@ +HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@ +HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@ +HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@ +HWLOC_CFLAGS = @HWLOC_CFLAGS@ +HWLOC_LIBS = @HWLOC_LIBS@ +ICD_LD_FLAGS = @ICD_LD_FLAGS@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +KERNEL_COMPILER_LIB_VERSION = @KERNEL_COMPILER_LIB_VERSION@ +LD = @LD@ +LDFLAGS = @LDFLAGS@ +LD_FLAGS_BIN = @LD_FLAGS_BIN@ +LIBOBJS = @LIBOBJS@ +LIBRARY_SUFFIX = @LIBRARY_SUFFIX@ +LIBS = @LIBS@ +LIBSPE_CFLAGS = @LIBSPE_CFLAGS@ +LIBSPE_LIBS = @LIBSPE_LIBS@ +LIBTOOL = @LIBTOOL@ +LIB_VERSION = @LIB_VERSION@ +LIPO = @LIPO@ +LLC = @LLC@ +LLVM_AR = 
@LLVM_AR@ +LLVM_AS = @LLVM_AS@ +LLVM_CONFIG = @LLVM_CONFIG@ +LLVM_LINK = @LLVM_LINK@ +LLVM_OPT = @LLVM_OPT@ +LLVM_RANLIB = @LLVM_RANLIB@ +LLVM_VERSION = @LLVM_VERSION@ +LN_S = @LN_S@ +LTDL_LIBS = @LTDL_LIBS@ +LTLIBOBJS = @LTLIBOBJS@ +MAKEINFO = @MAKEINFO@ +MANIFEST_TOOL = @MANIFEST_TOOL@ +MKDIR_P = @MKDIR_P@ +NM = @NM@ +NMEDIT = @NMEDIT@ +OBJDUMP = @OBJDUMP@ +OBJEXT = @OBJEXT@ +OCL_ICD_CFLAGS = @OCL_ICD_CFLAGS@ +OCL_ICD_LIBS = @OCL_ICD_LIBS@ +OCL_KERNEL_TARGET = @OCL_KERNEL_TARGET@ +OCL_TARGETS = @OCL_TARGETS@ +OPENCL_CFLAGS = @OPENCL_CFLAGS@ +OPENCL_CMAKE = @OPENCL_CMAKE@ +OPENCL_EXTLIBS = @OPENCL_EXTLIBS@ +OPENCL_LIBS = @OPENCL_LIBS@ +OPT = @OPT@ +OTOOL = @OTOOL@ +OTOOL64 = @OTOOL64@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PKG_CONFIG = @PKG_CONFIG@ +PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@ +PKG_CONFIG_PATH = @PKG_CONFIG_PATH@ +POAT_TESTSUITES = @POAT_TESTSUITES@ +POCL_DEVICE_ADDRESS_BITS = @POCL_DEVICE_ADDRESS_BITS@ +PTHREAD_CC = @PTHREAD_CC@ +PTHREAD_CFLAGS = @PTHREAD_CFLAGS@ +PTHREAD_LIBS = @PTHREAD_LIBS@ +RANLIB = @RANLIB@ +SDL_CFLAGS = @SDL_CFLAGS@ +SDL_LIBS = @SDL_LIBS@ +SED = @SED@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +TARGET = @TARGET@ +TARGET_CLANG_FLAGS = @TARGET_CLANG_FLAGS@ +TARGET_CPU = @TARGET_CPU@ +TARGET_LLC_FLAGS = @TARGET_LLC_FLAGS@ +TARGET_SIZEOF_DOUBLE = @TARGET_SIZEOF_DOUBLE@ +TARGET_SIZEOF_HALF = @TARGET_SIZEOF_HALF@ +TARGET_SIZEOF_LONG = @TARGET_SIZEOF_LONG@ +TARGET_SIZEOF_VOID_P = @TARGET_SIZEOF_VOID_P@ +TCECC = @TCECC@ +TCE_AVAILABLE = @TCE_AVAILABLE@ +TCE_CONFIG = @TCE_CONFIG@ +VERSION = @VERSION@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ 
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ +acx_pthread_config = @acx_pthread_config@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target = @target@ +target_alias = @target_alias@ +target_cpu = @target_cpu@ +target_os = @target_os@ +target_vendor = @target_vendor@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +pkglib_LTLIBRARIES = llvmopencl.la +AM_CXXFLAGS = -I@top_srcdir@/fix-include -I@top_srcdir@/include `@LLVM_CONFIG@ --cxxflags` +AM_LDFLAGS = -module -export-dynamic -version-info ${KERNEL_COMPILER_LIB_VERSION} `@LLVM_CONFIG@ --ldflags` +llvmopencl_la_LIBADD = -lLLVM-${LLVM_VERSION} +llvmopencl_la_SOURCES = Barrier.h \ + BarrierBlock.h BarrierBlock.cc \ + Kernel.h Kernel.cc \ + ParallelRegion.h ParallelRegion.cc \ + CanonicalizeBarriers.h CanonicalizeBarriers.cc \ + LoopBarriers.h LoopBarriers.cc \ + GenerateHeader.cc Workgroup.h Workgroup.cc \ + BarrierTailReplication.h BarrierTailReplication.cc \ + Flatten.cc IsolateRegions.h IsolateRegions.cc \ + 
WorkitemReplication.h WorkitemReplication.cc \ + ImplicitLoopBarriers.h ImplicitLoopBarriers.cc \ + WorkItemAliasAnalysis.cc WIVectorize.cc \ + WorkitemHandler.h WorkitemHandler.cc \ + WorkitemLoops.h WorkitemLoops.cc \ + PHIsToAllocas.h PHIsToAllocas.cc \ + BreakConstantGEPs.h BreakConstantGEPs.cpp \ + WorkitemHandlerChooser.h WorkitemHandlerChooser.cc \ + AllocasToEntry.h AllocasToEntry.cc \ + TargetAddressSpaces.h TargetAddressSpaces.cc \ + LLVMUtils.cc LLVMUtils.h \ + VariableUniformityAnalysis.h VariableUniformityAnalysis.cc + +all: all-am + +.SUFFIXES: +.SUFFIXES: .cc .cpp .lo .o .obj +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign lib/llvmopencl/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --foreign lib/llvmopencl/Makefile +.PRECIOUS: Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' 
in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +install-pkglibLTLIBRARIES: $(pkglib_LTLIBRARIES) + @$(NORMAL_INSTALL) + @list='$(pkglib_LTLIBRARIES)'; test -n "$(pkglibdir)" || list=; \ + list2=; for p in $$list; do \ + if test -f $$p; then \ + list2="$$list2 $$p"; \ + else :; fi; \ + done; \ + test -z "$$list2" || { \ + echo " $(MKDIR_P) '$(DESTDIR)$(pkglibdir)'"; \ + $(MKDIR_P) "$(DESTDIR)$(pkglibdir)" || exit 1; \ + echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 '$(DESTDIR)$(pkglibdir)'"; \ + $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 "$(DESTDIR)$(pkglibdir)"; \ + } + +uninstall-pkglibLTLIBRARIES: + @$(NORMAL_UNINSTALL) + @list='$(pkglib_LTLIBRARIES)'; test -n "$(pkglibdir)" || list=; \ + for p in $$list; do \ + $(am__strip_dir) \ + echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f '$(DESTDIR)$(pkglibdir)/$$f'"; \ + $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f "$(DESTDIR)$(pkglibdir)/$$f"; \ + done + +clean-pkglibLTLIBRARIES: + -test -z "$(pkglib_LTLIBRARIES)" || rm -f $(pkglib_LTLIBRARIES) + @list='$(pkglib_LTLIBRARIES)'; \ + locs=`for p in $$list; do echo $$p; done | \ + sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \ + sort -u`; \ + test -z "$$locs" || { \ + echo rm 
-f $${locs}; \ + rm -f $${locs}; \ + } + +llvmopencl.la: $(llvmopencl_la_OBJECTS) $(llvmopencl_la_DEPENDENCIES) $(EXTRA_llvmopencl_la_DEPENDENCIES) + $(AM_V_CXXLD)$(CXXLINK) -rpath $(pkglibdir) $(llvmopencl_la_OBJECTS) $(llvmopencl_la_LIBADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/AllocasToEntry.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/BarrierBlock.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/BarrierTailReplication.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/BreakConstantGEPs.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/CanonicalizeBarriers.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Flatten.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/GenerateHeader.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ImplicitLoopBarriers.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/IsolateRegions.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Kernel.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/LLVMUtils.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/LoopBarriers.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/PHIsToAllocas.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ParallelRegion.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/TargetAddressSpaces.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/VariableUniformityAnalysis.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/WIVectorize.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/WorkItemAliasAnalysis.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Workgroup.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/WorkitemHandler.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/WorkitemHandlerChooser.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/WorkitemLoops.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/WorkitemReplication.Plo@am__quote@ + +.cc.o: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ $< + +.cc.obj: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.cc.lo: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LTCXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o $@ $< + +.cpp.o: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ 
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ $< + +.cpp.obj: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.cpp.lo: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LTCXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o $@ $< + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +ID: $(am__tagged_files) + $(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-am +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + set x; \ + here=`pwd`; \ + $(am__define_uniq_tagged_files); \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: ctags-am + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + $(am__define_uniq_tagged_files); \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" +cscopelist: 
cscopelist-am + +cscopelist-am: $(am__tagged_files) + list='$(am__tagged_files)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(LTLIBRARIES) +installdirs: + for dir in "$(DESTDIR)$(pkglibdir)"; do \ + test -z "$$dir" || $(MKDIR_P) "$$dir"; \ + done +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+clean: clean-am + +clean-am: clean-generic clean-libtool clean-pkglibLTLIBRARIES \ + mostlyclean-am + +distclean: distclean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: install-pkglibLTLIBRARIES + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: uninstall-pkglibLTLIBRARIES + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS TAGS all all-am check check-am clean clean-generic \ + clean-libtool clean-pkglibLTLIBRARIES cscopelist-am ctags \ + ctags-am distclean distclean-compile distclean-generic \ + distclean-libtool distclean-tags distdir dvi dvi-am html \ + html-am info info-am install install-am install-data \ + install-data-am install-dvi install-dvi-am install-exec \ + install-exec-am install-html install-html-am install-info \ + install-info-am install-man install-pdf install-pdf-am \ + install-pkglibLTLIBRARIES install-ps install-ps-am \ + install-strip installcheck installcheck-am installdirs \ + maintainer-clean maintainer-clean-generic mostlyclean \ + mostlyclean-compile mostlyclean-generic mostlyclean-libtool \ + pdf pdf-am ps ps-am tags tags-am uninstall uninstall-am \ + uninstall-pkglibLTLIBRARIES + + +#llvmopencl_la_LIBADD += @LIBS_LLVMTRANSFORMUTILS@ + +# Tell versions [3.59,3.63) of GNU make to not 
export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/src/llvmopencl/PHIsToAllocas.cc b/src/llvmopencl/PHIsToAllocas.cc new file mode 100644 index 0000000..a414412 --- /dev/null +++ b/src/llvmopencl/PHIsToAllocas.cc @@ -0,0 +1,144 @@ +// LLVM function pass to convert all PHIs to allocas. +// +// Copyright (c) 2012 Pekka Jääskeläinen / TUT +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#include "PHIsToAllocas.h" +#include "Workgroup.h" +#include "WorkitemHandlerChooser.h" +#include "WorkitemLoops.h" + +#include "config.h" + +#ifdef LLVM_3_1 +#include "llvm/Support/IRBuilder.h" +#include "llvm/Support/TypeBuilder.h" +#elif defined LLVM_3_2 +#include "llvm/IRBuilder.h" +#include "llvm/TypeBuilder.h" +#else +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/TypeBuilder.h" +#endif + +namespace { + static + llvm::RegisterPass<pocl::PHIsToAllocas> X( + "phistoallocas", "Convert all PHI nodes to allocas"); +} + +namespace pocl { + +char PHIsToAllocas::ID = 0; + +using namespace llvm; + +void +PHIsToAllocas::getAnalysisUsage(AnalysisUsage &AU) const +{ + AU.addRequired<pocl::WorkitemHandlerChooser>(); + AU.addPreserved<pocl::WorkitemHandlerChooser>(); +} + +bool +PHIsToAllocas::runOnFunction(Function &F) +{ + if (!Workgroup::isKernelToProcess(F)) + return false; + + /* Skip PHIsToAllocas when we are not creating the work item loops, + as leads to worse code without benefits for the full replication method. + */ + if (getAnalysis<pocl::WorkitemHandlerChooser>().chosenHandler() != + pocl::WorkitemHandlerChooser::POCL_WIH_LOOPS) + return false; + + typedef std::vector<llvm::Instruction* > InstructionVec; + + InstructionVec PHIs; + + for (Function::iterator bb = F.begin(); bb != F.end(); ++bb) { + for (BasicBlock::iterator p = bb->begin(); + p != bb->end(); ++p) + { + Instruction* instr = p; + if (isa<PHINode>(instr)) + { + PHIs.push_back(instr); + } + } + + } + + bool changed = false; + for (InstructionVec::iterator i = PHIs.begin(); i != PHIs.end(); + ++i) + { + Instruction *instr = *i; + BreakPHIToAllocas(dyn_cast<PHINode>(instr)); + changed = true; + } + return changed; + +} + +/** + * Convert a PHI to a read from a stack value and all the sources to + * writes to the same stack value. 
+ * + * Used to fix context save/restore issues with regions with PHI nodes in the + * entry node (usually due to the use of work group scope variables such as + * B-loop iteration variables). In case of PHI nodes at region entries, we cannot + * just insert the context restore code because it is assumed there are no + * non-phi Instructions before PHIs which the context restore + * code constitutes to. Secondly, in case the PHINode is at a + * region entry (e.g. a B-Loop) adding new basic blocks before it would + * break the assumption of single entry regions. + */ +llvm::Instruction * +PHIsToAllocas::BreakPHIToAllocas(PHINode* phi) +{ + std::string allocaName = std::string(phi->getName().str()) + ".ex_phi"; + + llvm::Function *function = phi->getParent()->getParent(); + IRBuilder<> builder(function->getEntryBlock().getFirstInsertionPt()); + + llvm::Instruction *alloca = + builder.CreateAlloca(phi->getType(), 0, allocaName); + + for (unsigned incoming = 0; incoming < phi->getNumIncomingValues(); + ++incoming) + { + Value *val = phi->getIncomingValue(incoming); + BasicBlock *incomingBB = phi->getIncomingBlock(incoming); + builder.SetInsertPoint(incomingBB->getTerminator()); + builder.CreateStore(val, alloca); + } + + builder.SetInsertPoint(phi); + + llvm::Instruction *loadedValue = builder.CreateLoad(alloca); + phi->replaceAllUsesWith(loadedValue); + phi->eraseFromParent(); + return loadedValue; +} + + +} diff --git a/src/llvmopencl/PHIsToAllocas.h b/src/llvmopencl/PHIsToAllocas.h new file mode 100644 index 0000000..819dcfc --- /dev/null +++ b/src/llvmopencl/PHIsToAllocas.h @@ -0,0 +1,56 @@ +// Header for PHIsToAllocas function pass. 
+// +// Copyright (c) 2012 Pekka Jääskeläinen / TUT +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#ifndef _POCL_PHIS_TO_ALLOCAS_H +#define _POCL_PHIS_TO_ALLOCAS_H + +#include "config.h" +#if (defined LLVM_3_1 or defined LLVM_3_2) +#include "llvm/Function.h" +#else +#include "llvm/IR/Function.h" +#endif +#include "llvm/Pass.h" + +namespace llvm { + class Instruction; + class PHINode; +} + +namespace pocl { + class Workgroup; + + class PHIsToAllocas : public llvm::FunctionPass { + public: + static char ID; + + PHIsToAllocas() : llvm::FunctionPass(ID) {} + + virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const; + virtual bool runOnFunction(llvm::Function &F); + + llvm::Instruction *BreakPHIToAllocas(llvm::PHINode* phi); + + }; +} + +#endif diff --git a/src/llvmopencl/ParallelRegion.cc b/src/llvmopencl/ParallelRegion.cc new file mode 100644 index 0000000..72d89c1 --- /dev/null +++ b/src/llvmopencl/ParallelRegion.cc @@ -0,0 +1,809 @@ +// Class definition for parallel regions, a group of BasicBlocks that +// each kernel should run in parallel. +// +// Copyright (c) 2011 Universidad Rey Juan Carlos and +// 2012 Pekka Jääskeläinen / TUT +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "ParallelRegion.h" +#include "Barrier.h" +#include "Kernel.h" +#include "config.h" +#ifdef LLVM_3_1 +#include "llvm/Support/IRBuilder.h" +#include "llvm/ValueSymbolTable.h" +#elif defined LLVM_3_2 +#include "llvm/IRBuilder.h" +#include "llvm/ValueSymbolTable.h" +#else +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/ValueSymbolTable.h" +#endif +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" + +#include <set> +#include <sstream> +#include <map> +#include <algorithm> + +using namespace std; +using namespace llvm; +using namespace pocl; + +//#define DEBUG_REMAP +//#define DEBUG_REPLICATE +//#define DEBUG_PURGE + +#include <iostream> + +int ParallelRegion::idGen = 0; + + +ParallelRegion::ParallelRegion(int forcedRegionId) : + std::vector<llvm::BasicBlock *>(), + LocalIDXLoadInstr(NULL), LocalIDYLoadInstr(NULL), LocalIDZLoadInstr(NULL), + exitIndex_(0), entryIndex_(0), pRegionId(forcedRegionId) +{ + if (forcedRegionId == -1) + pRegionId = idGen++; +} + +/** + * Ensure all variables are named so they will be replicated and renamed + * correctly. + */ +void +ParallelRegion::GenerateTempNames(llvm::BasicBlock *bb) +{ + for (llvm::BasicBlock::iterator i = bb->begin(), e = bb->end(); i != e; ++i) + { + llvm::Instruction *instr = i; + if (instr->hasName() || !instr->isUsedOutsideOfBlock(bb)) continue; + int tempCounter = 0; + std::string tempName = ""; + do { + std::ostringstream name; + name << ".pocl_temp." 
<< tempCounter; + ++tempCounter; + tempName = name.str(); + } while (bb->getParent()->getValueSymbolTable().lookup(tempName) != NULL); + instr->setName(tempName); + } +} + +// BarrierBlock * +// ParallelRegion::getEntryBarrier() +// { +// BasicBlock *entry = front(); +// BasicBlock *barrier = entry->getSinglePredecessor(); + +// return cast<BarrierBlock> (barrier); +// } + +ParallelRegion * +ParallelRegion::replicate(ValueToValueMapTy &map, + const Twine &suffix = "") +{ + ParallelRegion *new_region = new ParallelRegion(pRegionId); + + /* Because ParallelRegions are all replicated before they + are attached to the function, it can happen that + the same BB is replicated multiple times and it gets + the same name (only the BB name will be autorenamed + by LLVM). This causes the variable references to become + broken. This hack ensures the BB suffixes are unique + before cloning so each path gets their own value + names. Split points can be such paths.*/ + static std::map<std::string, int> cloneCounts; + + for (iterator i = begin(), e = end(); i != e; ++i) { + BasicBlock *block = *i; + GenerateTempNames(block); + std::ostringstream suf; + suf << suffix.str(); + std::string block_name = block->getName().str() + "." + suffix.str(); + if (cloneCounts[block_name] > 0) + { + suf << ".pocl_" << cloneCounts[block_name]; + } + BasicBlock *new_block = CloneBasicBlock(block, map, suf.str()); + cloneCounts[block_name]++; + // Insert the block itself into the map. + map[block] = new_block; + new_region->push_back(new_block); + +#ifdef DEBUG_REPLICATE + std::cerr << "### clonee block:" << std::endl; + block->dump(); + std::cerr << endl << "### cloned block: " << std::endl; + new_block->dump(); +#endif + } + + new_region->exitIndex_ = exitIndex_; + new_region->entryIndex_ = entryIndex_; + /* Remap here to get local variables fixed before they + are (possibly) overwritten by another clone of the + same BB. 
*/ + new_region->remap(map); + +#ifdef DEBUG_REPLICATE + Verify(); +#endif + LocalizeIDLoads(); + + return new_region; +} + +void +ParallelRegion::remap(ValueToValueMapTy &map) +{ + for (iterator i = begin(), e = end(); i != e; ++i) { + +#ifdef DEBUG_REMAP + std::cerr << "### block before remap:" << std::endl; + (*i)->dump(); +#endif + + for (BasicBlock::iterator ii = (*i)->begin(), ee = (*i)->end(); + ii != ee; ++ii) + RemapInstruction(ii, map, + RF_IgnoreMissingEntries | RF_NoModuleLevelChanges); + +#ifdef DEBUG_REMAP + std::cerr << endl << "### block after remap: " << std::endl; + (*i)->dump(); +#endif + } +} + +void +ParallelRegion::chainAfter(ParallelRegion *region) +{ + /* If we are replicating a conditional barrier + region, the last block can be an unreachable + block to mark the impossible path. Skip + it and choose the correct branch instead. + + TODO: why have the unreachable block there the + first place? Could we just not add it and fix + the branch? */ + BasicBlock *tail = region->exitBB(); + TerminatorInst *t = tail->getTerminator(); + if (isa<UnreachableInst>(t)) + { + tail = region->at(region->size() - 2); + t = tail->getTerminator(); + } + if (t->getNumSuccessors() != 1) + { + std::cout << "!!! trying to chain region" << std::endl; + this->dumpNames(); + std::cout << "!!! after region" << std::endl; + region->dumpNames(); + t->getParent()->dump(); + + assert (t->getNumSuccessors() == 1); + } + + BasicBlock *successor = t->getSuccessor(0); + Function::BasicBlockListType &bb_list = + successor->getParent()->getBasicBlockList(); + + for (iterator i = begin(), e = end(); i != e; ++i) + bb_list.insertAfter(tail, *i); + + t->setSuccessor(0, entryBB()); + + t = exitBB()->getTerminator(); + assert (t->getNumSuccessors() == 1); + t->setSuccessor(0, successor); +} + +void +ParallelRegion::purge() +{ + SmallVector<BasicBlock *, 4> new_blocks; + + for (iterator i = begin(), e = end(); i != e; ++i) { + + // Exit block has a successor out of the region. 
+ if (*i == exitBB()) + continue; + +#ifdef DEBUG_PURGE + std::cerr << "### block before purge:" << std::endl; + (*i)->dump(); +#endif + TerminatorInst *t = (*i)->getTerminator(); + for (unsigned ii = 0, ee = t->getNumSuccessors(); ii != ee; ++ii) { + BasicBlock *successor = t->getSuccessor(ii); + if (count(begin(), end(), successor) == 0) { + // This successor is not on the parallel region, purge. + iterator next_block = i; + ++next_block; + assert ((*i)->getParent() != NULL && *next_block != NULL); + BasicBlock *unreachable = + BasicBlock::Create((*i)->getContext(), + (*i)->getName() + ".unreachable", + (*i)->getParent(), + *next_block); + new UnreachableInst(unreachable->getContext(), + unreachable); + t->setSuccessor(ii, unreachable); + new_blocks.push_back(unreachable); + } + } +#ifdef DEBUG_PURGE + std::cerr << std::endl << "### block after purge:" << std::endl; + (*i)->dump(); +#endif + } + + // Add the new "unreachable" blocks to the + // region. We cannot do in the loop as it + // corrupts iterators. 
+ insert(end(), new_blocks.begin(), new_blocks.end()); +} + +void +ParallelRegion::insertLocalIdInit(llvm::BasicBlock* entry, + unsigned x, + unsigned y, + unsigned z) +{ + IRBuilder<> builder(entry, entry->getFirstInsertionPt()); + + Module *M = entry->getParent()->getParent(); + + int size_t_width = 32; + if (M->getPointerSize() == llvm::Module::Pointer64) + size_t_width = 64; + + GlobalVariable *gvx = M->getGlobalVariable(POCL_LOCAL_ID_X_GLOBAL); + if (gvx != NULL) + builder.CreateStore(ConstantInt::get(IntegerType:: + get(M->getContext(), size_t_width), + x), gvx); + + GlobalVariable *gvy = M->getGlobalVariable(POCL_LOCAL_ID_Y_GLOBAL); + if (gvy != NULL) + builder.CreateStore(ConstantInt::get(IntegerType:: + get(M->getContext(), size_t_width), + y), gvy); + + GlobalVariable *gvz = M->getGlobalVariable(POCL_LOCAL_ID_Z_GLOBAL); + if (gvz != NULL) + builder.CreateStore(ConstantInt::get(IntegerType:: + get(M->getContext(), size_t_width), + z), gvz); +} + +void +ParallelRegion::insertPrologue(unsigned x, + unsigned y, + unsigned z) +{ + BasicBlock *entry = entryBB(); + ParallelRegion::insertLocalIdInit(entry, x, y, z); +} + +void +ParallelRegion::dump() +{ + for (iterator i = begin(), e = end(); i != e; ++i) + (*i)->dump(); +} + +void +ParallelRegion::dumpNames() +{ + for (iterator i = begin(), e = end(); i != e; ++i) + { + std::cout << (*i)->getName().str(); + if (entryBB() == (*i)) + std::cout << "(EN)"; + if (exitBB() == (*i)) + std::cout << "(EX)"; + std::cout << " "; + } + std::cout << std::endl; +} + +ParallelRegion * +ParallelRegion::Create(const SmallPtrSet<BasicBlock *, 8>& bbs, BasicBlock *entry, BasicBlock *exit) +{ + ParallelRegion *new_region = new ParallelRegion(); + + assert (entry != NULL); + assert (exit != NULL); + + // This is done in two steps so order of the vector + // is the same as original function order. 
+ Function *F = entry->getParent(); + for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) { + BasicBlock *b = i; + for (SmallPtrSetIterator<BasicBlock *> j = bbs.begin(); j != bbs.end(); ++j) { + if (*j == b) { + new_region->push_back(i); + if (entry == *j) + new_region->setEntryBBIndex(new_region->size() - 1); + else if (exit == *j) + new_region->setExitBBIndex(new_region->size() - 1); + break; + } + } + } + + new_region->LocalizeIDLoads(); + + assert(new_region->Verify()); + + return new_region; +} + +bool +ParallelRegion::Verify() +{ + // Parallel region conditions: + // 1) Single entry, in entry block. + // 2) Single outgoing edge from exit block + // (other outgoing edges allowed, will be purged in replicas). + // 3) No barriers inside the region. + + int entry_edges = 0; + + for (iterator i = begin(), e = end(); i != e; ++i) { + for (pred_iterator ii(*i), ee(*i, true); ii != ee; ++ii) { + if (count(begin(), end(), *ii) == 0) { + if ((*i) != entryBB()) { + dumpNames(); + std::cerr << "suspicious block: " << (*i)->getName().str() << std::endl; + std::cerr << "the entry is: " << entryBB()->getName().str() << std::endl; + +#if 0 + (*i)->getParent()->viewCFG(); +#endif + assert(0 && "Incoming edges to non-entry block!"); + return false; + } else if (!Barrier::hasBarrier(*ii)) { + (*i)->getParent()->viewCFG(); + assert (0 && "Entry has edges from non-barrier blocks!"); + return false; + } + ++entry_edges; + } + } + + // if (entry_edges != 1) { + // assert(0 && "Parallel regions must be single entry!"); + // return false; + // } + + if (exitBB()->getTerminator()->getNumSuccessors() != 1) { + assert(0 && "Multiple outgoing edges from exit block!"); + return false; + } + + for (BasicBlock::iterator ii = (*i)->begin(), ee = (*i)->end(); + ii != ee; ++ii) { + if (isa<Barrier> (ii)) { + assert(0 && "Barrier found inside parallel region!"); + return false; + } + } + } + + return true; +} + +/** + * Adds metadata to all the memory instructions to denote + * 
they originate from a parallel loop. + * + * Due to nested parallel loops, there can be multiple loop + * references. + * + * Format: + * llvm.mem.parallel_loop_access !0 + * + * !0 { metadata !0 } + * + * In a 2-nested loop: + * + * llvm.mem.parallel_loop_access !0 + * + * !0 { metadata !1, metadata !2} + * !1 { metadata !1 } + * !2 { metadata !2 } + */ +void +ParallelRegion::AddParallelLoopMetadata(llvm::MDNode *identifier) { + + for (iterator i = begin(), e = end(); i != e; ++i) { + BasicBlock* bb = *i; + for (BasicBlock::iterator ii = bb->begin(), ee = bb->end(); + ii != ee; ii++) { + if (ii->mayReadOrWriteMemory()) { + std::vector<Value*> loopIds; + MDNode *oldIds = ii->getMetadata("llvm.mem.parallel_loop_access"); + if (oldIds != NULL) { + for (unsigned i = 0; i < oldIds->getNumOperands(); ++i) { + loopIds.push_back(oldIds->getOperand(i)); + } + } + loopIds.push_back(identifier); + ii->setMetadata("llvm.mem.parallel_loop_access", + MDNode::get(bb->getContext(), loopIds)); + } + } + } +} + +void +ParallelRegion::AddIDMetadata( + llvm::LLVMContext& context, + std::size_t x, + std::size_t y, + std::size_t z) { + + int counter = 1; + Value *v1[] = { + MDString::get(context, "WI_region"), + ConstantInt::get(Type::getInt32Ty(context), pRegionId)}; + MDNode* mdRegion = MDNode::get(context, v1); + Value *v2[] = { + MDString::get(context, "WI_xyz"), + ConstantInt::get(Type::getInt32Ty(context), x), + ConstantInt::get(Type::getInt32Ty(context), y), + ConstantInt::get(Type::getInt32Ty(context), z)}; + MDNode* mdXYZ = MDNode::get(context, v2); + Value *v[] = { + MDString::get(context, "WI_data"), + mdRegion, + mdXYZ}; + MDNode* md = MDNode::get(context, v); + + for (iterator i = begin(), e = end(); i != e; ++i) { + BasicBlock* bb = *i; + for (BasicBlock::iterator ii = bb->begin(); + ii != bb->end(); ii++) { + Value *v3[] = { + MDString::get(context, "WI_counter"), + ConstantInt::get(Type::getInt32Ty(context), counter)}; + MDNode* mdCounter = MDNode::get(context, v3); + 
counter++; + ii->setMetadata("wi", md); + ii->setMetadata("wi_counter", mdCounter); + } + } +} + + +/** + * Inserts a new basic block to the region, before an old basic block in + * the region. + * + * Assumes the inserted block to be before the other block in control + * flow, that is, there should be direct CFG edge from the block to the + * other. + */ +void +ParallelRegion::AddBlockBefore(llvm::BasicBlock *block, llvm::BasicBlock *before) +{ + llvm::BasicBlock *oldExit = exitBB(); + ParallelRegion::iterator beforePos = find(begin(), end(), before); + ParallelRegion::iterator oldExitPos = find(begin(), end(), oldExit); + assert (beforePos != end()); + + /* The old exit node might is now pushed further, at most one position. + Whether this is the case, depends if the node was inserted before or + after that node in the vector. That is, if indexof(before) < indexof(oldExit). */ + if (beforePos < oldExitPos) ++exitIndex_; + + insert(beforePos, block); + /* The entryIndex_ should be still correct. In case the 'before' block + was an old entry node, the new one replaces it as an entry node at + the same index and the old one gets pushed forward. */ +} + + +void +ParallelRegion::AddBlockAfter(llvm::BasicBlock *block, llvm::BasicBlock *after) +{ + llvm::BasicBlock *oldExit = exitBB(); + ParallelRegion::iterator afterPos = find(begin(), end(), after); + ParallelRegion::iterator oldExitPos = find(begin(), end(), oldExit); + assert (afterPos != end()); + + /* The old exit node might be pushed further, at most one position. + Whether this is the case, depends if the node was inserted before or + after that node in the vector. That is, if indexof(before) < indexof(oldExit). 
*/ + if (afterPos < oldExitPos) ++exitIndex_; + afterPos++; + insert(afterPos, block); +} + +bool +ParallelRegion::HasBlock(llvm::BasicBlock *bb) +{ + return find(begin(), end(), bb) != end(); +} + +/** + * Find the instruction that loads the Z dimension of the work item + * in the beginning of the parallel region, if not found, creates it. + */ +llvm::Instruction* +ParallelRegion::LocalIDZLoad() +{ + if (LocalIDZLoadInstr != NULL) return LocalIDZLoadInstr; + IRBuilder<> builder(entryBB()->getFirstInsertionPt()); + return LocalIDZLoadInstr = + builder.CreateLoad + (entryBB()->getParent()->getParent()->getGlobalVariable(POCL_LOCAL_ID_Z_GLOBAL)); +} + +/** + * Find the instruction that loads the Y dimension of the work item + * in the beginning of the parallel region, if not found, creates it. + */ +llvm::Instruction* +ParallelRegion::LocalIDYLoad() +{ + if (LocalIDYLoadInstr != NULL) return LocalIDYLoadInstr; + IRBuilder<> builder(entryBB()->getFirstInsertionPt()); + return LocalIDYLoadInstr = + builder.CreateLoad + (entryBB()->getParent()->getParent()->getGlobalVariable(POCL_LOCAL_ID_Y_GLOBAL)); +} + +/** + * Find the instruction that loads the X dimension of the work item + * in the beginning of the parallel region, if not found, creates it. 
+ */ +llvm::Instruction* +ParallelRegion::LocalIDXLoad() +{ + if (LocalIDXLoadInstr != NULL) return LocalIDXLoadInstr; + IRBuilder<> builder(entryBB()->getFirstInsertionPt()); + return LocalIDXLoadInstr = + builder.CreateLoad + (entryBB()->getParent()->getParent()->getGlobalVariable(POCL_LOCAL_ID_X_GLOBAL)); +} + +void +ParallelRegion::InjectPrintF +(llvm::Instruction *before, std::string formatStr, + std::vector<Value*>& params) +{ + IRBuilder<> builder(before); + llvm::Module *M = before->getParent()->getParent()->getParent(); + + llvm::Value *stringArg = + builder.CreateGlobalString(formatStr); + + /* generated with help from http://llvm.org/demo/index.cgi */ + Function* printfFunc = M->getFunction("printf"); + if (printfFunc == NULL) { + PointerType* PointerTy_4 = PointerType::get(IntegerType::get(M->getContext(), 8), 0); + + std::vector<Type*> FuncTy_6_args; + FuncTy_6_args.push_back(PointerTy_4); + + FunctionType* FuncTy_6 = + FunctionType::get + (/*Result=*/IntegerType::get(M->getContext(), 32), + /*Params=*/FuncTy_6_args, + /*isVarArg=*/true); + + printfFunc = + Function::Create + (/*Type=*/FuncTy_6, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"printf", M); + printfFunc->setCallingConv(CallingConv::C); + +#if (defined LLVM_3_1 or defined LLVM_3_2) + AttrListPtr func_printf_PAL; +#else + AttributeSet func_printf_PAL; +#endif + { +#ifdef LLVM_3_1 + SmallVector<AttributeWithIndex, 4> Attrs; + AttributeWithIndex PAWI; + PAWI.Index = 1U; + PAWI.Attrs = Attribute::NoCapture; + Attrs.push_back(PAWI); + PAWI.Index = 4294967295U; + PAWI.Attrs = Attribute::NoUnwind; + Attrs.push_back(PAWI); + func_printf_PAL = AttrListPtr::get(Attrs.begin(), Attrs.end()); +#elif defined LLVM_3_2 + SmallVector<AttributeWithIndex, 4> Attrs; + Attrs.push_back(AttributeWithIndex::get(M->getContext(), 1U, Attributes::NoCapture)); + Attrs.push_back(AttributeWithIndex::get(M->getContext(), 4294967295U, Attributes::NoUnwind)); + func_printf_PAL = AttrListPtr::get(M->getContext(), 
Attrs); +#else + func_printf_PAL.addAttribute( M->getContext(), 1U, Attribute::NoCapture); + func_printf_PAL.addAttribute( M->getContext(), 4294967295U, Attribute::NoUnwind); +#endif + } + printfFunc->setAttributes(func_printf_PAL); + } + + std::vector<Constant*> const_ptr_8_indices; + + ConstantInt* const_int64_9 = ConstantInt::get(M->getContext(), APInt(64, StringRef("0"), 10)); + const_ptr_8_indices.push_back(const_int64_9); + const_ptr_8_indices.push_back(const_int64_9); + assert (isa<Constant>(stringArg)); + Constant* const_ptr_8 = + ConstantExpr::getGetElementPtr + (cast<Constant>(stringArg), const_ptr_8_indices); + + std::vector<Value*> args; + args.push_back(const_ptr_8); + args.insert(args.end(), params.begin(), params.end()); + + CallInst::Create(printfFunc, args, "", before); +} + +void +ParallelRegion::SetExitBB(llvm::BasicBlock *block) +{ + for (size_t i = 0; i < size(); ++i) + { + if (at(i) == block) + { + setExitBBIndex(i); + return; + } + } + assert (false && "The block was not found in the PRegion!"); +} + +/** + * Adds a printf to the end of the parallel region that prints the + * region ID and the work item ID. + * + * Useful for debugging control flow bugs. 
+ */ +void +ParallelRegion::InjectRegionPrintF() +{ + llvm::Module *M = entryBB()->getParent()->getParent(); + +#if 0 + // it should reuse equal strings anyways + const char* FORMAT_STR_VAR = ".pocl.pRegion_debug_str"; + llvm::Value *stringArg = M->getGlobalVariable(FORMAT_STR_VAR); + if (stringArg == NULL) + { + IRBuilder<> builder(entryBB()); + stringArg = builder.CreateGlobalString("PR %d WI %u %u %u\n", FORMAT_STR_VAR); + } +#endif + + ConstantInt* pRID = ConstantInt::get(M->getContext(), APInt(32, pRegionId, 10)); + std::vector<Value*> params; + params.push_back(pRID); + params.push_back(LocalIDXLoad()); + params.push_back(LocalIDYLoad()); + params.push_back(LocalIDZLoad()); + + InjectPrintF(exitBB()->getTerminator(), "PR %d WI %u %u %u\n", params); + +} + +/** + * Adds a printf to the end of the parallel region that prints the + * hex contents of all named non-pointer variables. + * + * Useful for debugging data flow bugs. + */ +void +ParallelRegion::InjectVariablePrintouts() +{ + for (ParallelRegion::iterator i = begin(); + i != end(); ++i) + { + llvm::BasicBlock *bb = *i; + for (llvm::BasicBlock::iterator instr = bb->begin(); + instr != bb->end(); ++instr) + { + llvm::Instruction *instruction = instr; + if (isa<PointerType>(instruction->getType()) || + !instruction->hasName()) continue; + std::string name = instruction->getName().str(); + std::vector<Value*> args; + IRBuilder<> builder(exitBB()->getTerminator()); + args.push_back(builder.CreateGlobalString(name)); + args.push_back(instruction); + InjectPrintF(instruction->getParent()->getTerminator(), "variable %s == %x\n", args); + } + } +} + +/** + * Localizes all the loads to the the work-item identifiers. + * + * In case the code inside the region queries the WI id, it + * should not (re)use one that is loaded in another region, but + * one that is loaded in the same region. Otherwise, it ends + * up using the last id the previous PR work-item loop got. 
+ * This caused problems in cases where the local id was stored + * to a temporary variable in an earlier region and that temp + * was reused later. + * + * The function scans for all loads from the local id variables + * and converts them to loads inside the parallel region. + */ +void +ParallelRegion::LocalizeIDLoads() +{ + /* The local id loads inside the parallel region. */ + llvm::Instruction* LocalIDXLoadInstr = LocalIDXLoad(); + llvm::Instruction* LocalIDYLoadInstr = LocalIDYLoad(); + llvm::Instruction* LocalIDZLoadInstr = LocalIDZLoad(); + llvm::Module *M = LocalIDXLoadInstr->getParent()->getParent()->getParent(); + llvm::Value *localIdZ = M->getNamedGlobal(POCL_LOCAL_ID_Z_GLOBAL); + llvm::Value *localIdY = M->getNamedGlobal(POCL_LOCAL_ID_Y_GLOBAL); + llvm::Value *localIdX = M->getNamedGlobal(POCL_LOCAL_ID_X_GLOBAL); + + assert (localIdZ != NULL && localIdY != NULL && localIdX != NULL && + "The local id globals were not created."); + + for (ParallelRegion::iterator i = begin(); + i != end(); ++i) + { + llvm::BasicBlock *bb = *i; + for (llvm::BasicBlock::iterator instrI = bb->begin(); + instrI != bb->end(); ++instrI) + { + llvm::Instruction *instr = instrI; + if (instr == LocalIDXLoadInstr || + instr == LocalIDYLoadInstr || + instr == LocalIDZLoadInstr) continue; + + /* Search all operands of the instruction. If any of them is + using a local id, replace it with the intra-PR load from the + id variable. 
*/ + for (unsigned opr = 0; opr < instr->getNumOperands(); ++opr) + { + llvm::LoadInst *load = + dyn_cast<llvm::LoadInst>(instr->getOperand(opr)); + if (load == NULL) continue; + if (load == LocalIDXLoadInstr || + load == LocalIDYLoadInstr || + load == LocalIDZLoadInstr) continue; + + if (load->getPointerOperand() == localIdZ) + instr->setOperand(opr, LocalIDZLoadInstr); + if (load->getPointerOperand() == localIdY) + instr->setOperand(opr, LocalIDYLoadInstr); + if (load->getPointerOperand() == localIdX) + instr->setOperand(opr, LocalIDXLoadInstr); + } + } + } +} diff --git a/src/llvmopencl/ParallelRegion.h b/src/llvmopencl/ParallelRegion.h new file mode 100644 index 0000000..9313983 --- /dev/null +++ b/src/llvmopencl/ParallelRegion.h @@ -0,0 +1,127 @@ +// Class definition for parallel regions, a group of BasicBlocks that +// each kernel should run in parallel. +// +// Copyright (c) 2011 Universidad Rey Juan Carlos +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef _POCL_PARALLEL_REGION_H +#define _POCL_PARALLEL_REGION_H + +#include "BarrierBlock.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "config.h" +#if (defined LLVM_3_1 or defined LLVM_3_2) +#include "llvm/BasicBlock.h" +#include "llvm/LLVMContext.h" +#else +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/LLVMContext.h" +#endif +#include "llvm/Support/CFG.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/ADT/SmallVector.h" +#include <vector> + +namespace pocl { + +#define POCL_LOCAL_ID_X_GLOBAL "_local_id_x" +#define POCL_LOCAL_ID_Y_GLOBAL "_local_id_y" +#define POCL_LOCAL_ID_Z_GLOBAL "_local_id_z" + +class Kernel; + + // TODO Cleanup: this should not inherit vector but contain it. + // It now exposes too much to the clients and leads to hard + // to track errors when the API is changed. 
// A parallel region: an ordered collection of basic blocks with one
// designated entry block and one designated exit block. The blocks are
// stored in the inherited vector; entry and exit are tracked as indices
// into it.
class ParallelRegion : public std::vector<llvm::BasicBlock *> {
public:
  typedef llvm::SmallVector<ParallelRegion *, 8> ParallelRegionVector;

  ParallelRegion(int forcedRegionId=-1);

  /* BarrierBlock *getEntryBarrier(); */
  ParallelRegion *replicate(llvm::ValueToValueMapTy &map,
                            const llvm::Twine &suffix);
  void remap(llvm::ValueToValueMapTy &map);
  void purge();
  void chainAfter(ParallelRegion *region);
  void insertPrologue(unsigned x, unsigned y, unsigned z);
  static void insertLocalIdInit(llvm::BasicBlock* entry,
                                unsigned x,
                                unsigned y,
                                unsigned z);
  void dump();
  void dumpNames();
  // Raw index setters; the index is not validated here.
  void setEntryBBIndex(std::size_t index) { entryIndex_ = index; }
  void setExitBBIndex(std::size_t index) { exitIndex_ = index; }
  // Makes 'block', which must already be in the region, the exit block.
  void SetExitBB(llvm::BasicBlock *block);
  // Insert 'block' into the vector right before / right after a block
  // that is already in the region.
  void AddBlockBefore(llvm::BasicBlock *block, llvm::BasicBlock *before);
  void AddBlockAfter(llvm::BasicBlock *block, llvm::BasicBlock *after);

  // Entry/exit lookup; at() is bounds-checked, so a stale index throws
  // instead of reading out of range.
  llvm::BasicBlock* exitBB() { return at(exitIndex_); }
  llvm::BasicBlock* entryBB() { return at(entryIndex_); }
  void AddIDMetadata(llvm::LLVMContext& context,
                     std::size_t x = 0,
                     std::size_t y = 0,
                     std::size_t z = 0);

  void AddParallelLoopMetadata(llvm::MDNode *identifier);

  // True if 'bb' is stored in this region.
  bool HasBlock(llvm::BasicBlock *bb);

  // Debugging aids that insert printf calls into the region.
  void InjectRegionPrintF();
  void InjectVariablePrintouts();

  // Inserts a printf call with the given format string and arguments
  // in front of the instruction 'before'.
  void InjectPrintF
  (llvm::Instruction *before, std::string formatStr,
   std::vector<llvm::Value*>& params);

  static ParallelRegion *
  Create(const llvm::SmallPtrSet<llvm::BasicBlock *, 8>& bbs,
         llvm::BasicBlock *entry, llvm::BasicBlock *exit);

  static void GenerateTempNames(llvm::BasicBlock *bb);

  // Return the region's load of the respective local id global, creating
  // it in the entry block on first use.
  llvm::Instruction* LocalIDXLoad();
  llvm::Instruction* LocalIDYLoad();
  llvm::Instruction* LocalIDZLoad();

  // Rewrites uses of local id loads inside the region to use the
  // region's own loads.
  void LocalizeIDLoads();

private:
  // Cached results of LocalID{X,Y,Z}Load(); NULL until first requested.
  llvm::Instruction* LocalIDXLoadInstr;
  llvm::Instruction* LocalIDYLoadInstr;
  llvm::Instruction* LocalIDZLoadInstr;

  bool Verify();
  /// The indices of entry and exit, not pointers, for finding the BBs in the
  /// replicated PRs too.
  std::size_t exitIndex_;
  std::size_t entryIndex_;

  /// Identifier for the parallel region.
  int pRegionId;
  static int idGen;

};

}

#endif
diff --git a/src/llvmopencl/TargetAddressSpaces.cc b/src/llvmopencl/TargetAddressSpaces.cc
new file mode 100644
index 0000000..bd860cc
--- /dev/null
+++ b/src/llvmopencl/TargetAddressSpaces.cc
@@ -0,0 +1,220 @@
// Implementation of TargetAddressSpaces
//
// Copyright (c) 2013 Pekka Jääskeläinen / TUT
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
+ +#include "config.h" +#include <iostream> +#include <string> + +#ifdef LLVM_3_2 +# include <llvm/Instructions.h> +#else +# include <llvm/IR/Instructions.h> +# include <llvm/IR/Module.h> + +#endif +#include <llvm/Transforms/Utils/ValueMapper.h> +#include "llvm/Transforms/Utils/Cloning.h" + +#include "TargetAddressSpaces.h" +#include "Workgroup.h" +#include "LLVMUtils.h" +#include "pocl.h" + +#define DEBUG_TARGET_ADDRESS_SPACES + +namespace pocl { + +using namespace llvm; + +namespace { + static + RegisterPass<pocl::TargetAddressSpaces> X + ("target-address-spaces", + "Convert the 'fake' address space ids to the target specific ones."); +} + +char TargetAddressSpaces::ID = 0; + +TargetAddressSpaces::TargetAddressSpaces() : ModulePass(ID) { +} + +static Type * +ConvertedType(llvm::Type *type, std::map<unsigned, unsigned> &addrSpaceMap) { + + if (type->isPointerTy()) { + unsigned AS = type->getPointerAddressSpace(); + unsigned newAS = addrSpaceMap[AS]; + return PointerType::get(ConvertedType(type->getPointerElementType(), addrSpaceMap), newAS); + } else if (type->isArrayTy()) { + return ArrayType::get + (ConvertedType(type->getArrayElementType(), addrSpaceMap), type->getArrayNumElements()); + } else { /* TODO: pointers inside structs */ + return type; + } +} + +static bool +UpdateAddressSpace(llvm::Value& val, std::map<unsigned, unsigned> &addrSpaceMap) { + Type *type = val.getType(); + if (!type->isPointerTy()) return false; + + Type *newType = ConvertedType(type, addrSpaceMap); + if (newType == type) return false; + + val.mutateType(newType); + return true; +} + + +bool +TargetAddressSpaces::runOnModule(llvm::Module &M) { + + std::string triple = M.getTargetTriple(); + std::string arch = triple; + size_t dash = triple.find("-"); + if (dash != std::string::npos) { + arch = triple.substr(0, dash); + } + + std::map<unsigned, unsigned> addrSpaceMap; + + if (arch == "x86_64") { + /* For x86_64 the default isel seems to work with the + fake address spaces. 
Skip the processing as it causes + an overhead and is not fully implemented. + */ + return false; + } else if (arch == "tce") { + /* TCE requires the remapping. */ + addrSpaceMap[POCL_ADDRESS_SPACE_GLOBAL] = 3; + addrSpaceMap[POCL_ADDRESS_SPACE_LOCAL] = 4; + /* LLVM 3.2 detects 'constant' as cuda_constant (5) in the fake + address space map. Add it for compatibility. */ + addrSpaceMap[5] = addrSpaceMap[POCL_ADDRESS_SPACE_CONSTANT] = 5; + + } else { + /* Assume the fake address space map works directly in case not + overridden here. */ + return false; + } + + bool changed = false; + /* Handle global variables. */ + llvm::Module::global_iterator globalI = M.global_begin(); + llvm::Module::global_iterator globalE = M.global_end(); + for (; globalI != globalE; ++globalI) { + llvm::Value &global = *globalI; + changed |= UpdateAddressSpace(global, addrSpaceMap); + } + + FunctionMapping funcReplacements; + std::vector<llvm::Function*> unhandledFuncs; + + /* Collect the functions to process first because we add + a new function per modified function which invalidates + the Module's function iterator. */ + for (llvm::Module::iterator functionI = M.begin(), functionE = M.end(); + functionI != functionE; ++functionI) { + if (functionI->empty() || functionI->getName().startswith("_GLOBAL")) + continue; + unhandledFuncs.push_back(functionI); + } + + for (std::vector<llvm::Function*>::iterator i = unhandledFuncs.begin(), + e = unhandledFuncs.end(); i != e; ++i) { + llvm::Function &F = **i; + + /* Convert the FunctionType. 
Because there is no mutator API in + LLVM for this, we need to recreate the whole darn function :( */ + SmallVector<Type *, 8> parameters; + for (Function::const_arg_iterator i = F.arg_begin(), + e = F.arg_end(); + i != e; ++i) + parameters.push_back(ConvertedType(i->getType(), addrSpaceMap)); + + llvm::FunctionType *ft = FunctionType::get + (ConvertedType(F.getReturnType(), addrSpaceMap), + parameters, F.isVarArg()); + + llvm::Function *newFunc = Function::Create(ft, F.getLinkage(), "", &M); + newFunc->takeName(&F); + + ValueToValueMapTy vv; + Function::arg_iterator j = newFunc->arg_begin(); + for (Function::const_arg_iterator i = F.arg_begin(), + e = F.arg_end(); + i != e; ++i) { + j->setName(i->getName()); + vv[i] = j; + ++j; + } + + SmallVector<ReturnInst *, 1> ri; + + class AddressSpaceReMapper : public ValueMapTypeRemapper { + public: + AddressSpaceReMapper(std::map<unsigned, unsigned> &addrSpaceMap) : + addrSpaceMap_(addrSpaceMap) {} + Type* remapType(Type *type) { + Type *newType = ConvertedType(type, addrSpaceMap_); + if (newType == type) return type; + return newType; + } + private: + std::map<unsigned, unsigned>& addrSpaceMap_; + } asvtm(addrSpaceMap); + + CloneFunctionInto(newFunc, &F, vv, true, ri, "", NULL, &asvtm); + funcReplacements[&F] = newFunc; + } + + /* Replace all references to the old function to the new one. 
*/ + llvm::Module::iterator fI = M.begin(); + llvm::Module::iterator fE = M.end(); + for (; fI != fE; ++fI) { + llvm::Function &F = *fI; + for (llvm::Function::iterator bbi = F.begin(), bbe = F.end(); bbi != bbe; + ++bbi) + for (llvm::BasicBlock::iterator ii = bbi->begin(), ie = bbi->end(); ii != ie; + ++ii) { + llvm::Instruction *instr = ii; + if (!isa<CallInst>(instr)) continue; + llvm::CallInst *call = dyn_cast<CallInst>(instr); + llvm::Function *calledF = call->getCalledFunction(); + if (funcReplacements.find(calledF) == funcReplacements.end()) continue; + + call->setCalledFunction(funcReplacements[calledF]); + } + } + + regenerate_kernel_metadata(M, funcReplacements); + + /* Delete the old functions. */ + for (FunctionMapping::iterator i = funcReplacements.begin(), + e = funcReplacements.end(); i != e; ++i) { + i->first->eraseFromParent(); + } + + return true; +} + +} diff --git a/src/llvmopencl/TargetAddressSpaces.h b/src/llvmopencl/TargetAddressSpaces.h new file mode 100644 index 0000000..1a080c8 --- /dev/null +++ b/src/llvmopencl/TargetAddressSpaces.h @@ -0,0 +1,54 @@ +// Header for TargetAddressSpaces, an LLVM pass that converts the +// generic address space ids to the target specific ones. +// +// Copyright (c) 2013 Pekka Jääskeläinen / TUT +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#ifndef _POCL_TARGET_ADDRESS_SPACES_H
#define _POCL_TARGET_ADDRESS_SPACES_H

#include "config.h"
#if (defined LLVM_3_1 or defined LLVM_3_2)
#include "llvm/Function.h"
#else
#include "llvm/IR/Function.h"
#endif

#include "llvm/Pass.h"

namespace pocl {
  /* pocl uses the fixed address space ids forced by the clang's
     -ffake-address-space-map internally until the end to be able to
     detect the different OpenCL address spaces unambiguously, regardless
     of the target. This pass converts the fake address space ids to
     the target-specific ones, if required by the code generator of that
     target. */
  class TargetAddressSpaces : public llvm::ModulePass {
  public:
    /* Pass identification; LLVM uses the address of this member. */
    static char ID;

    TargetAddressSpaces();
    virtual ~TargetAddressSpaces() {};

    /* Entry point of the pass, called once per module. Returns true if
       the module was modified. */
    virtual bool runOnModule(llvm::Module &M);
  };
}

#endif
diff --git a/src/llvmopencl/VariableUniformityAnalysis.cc b/src/llvmopencl/VariableUniformityAnalysis.cc
new file mode 100644
index 0000000..4362524
--- /dev/null
+++ b/src/llvmopencl/VariableUniformityAnalysis.cc
@@ -0,0 +1,382 @@
// Implementation for VariableUniformityAnalysis function pass.
+// +// Copyright (c) 2013 Pekka Jääskeläinen / Tampere University of Technology +// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#include "config.h" +#include <sstream> +#include <iostream> + +#ifdef LLVM_3_2 +#include "llvm/Metadata.h" +#include "llvm/Constants.h" +#include "llvm/Module.h" +#include "llvm/Instructions.h" +#include "llvm/ValueSymbolTable.h" +#include "llvm/DataLayout.h" +#else +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/ValueSymbolTable.h" +#include "llvm/IR/DataLayout.h" +#endif +#include "llvm/Support/CommandLine.h" +#include "llvm/Analysis/PostDominators.h" + +#include "WorkitemHandler.h" +#include "Kernel.h" +#include "VariableUniformityAnalysis.h" +#include "Barrier.h" + +//#define DEBUG_UNIFORMITY_ANALYSIS + +namespace pocl { + +char VariableUniformityAnalysis::ID = 0; + +using namespace llvm; + +static +RegisterPass<VariableUniformityAnalysis> X( + "uniformity", + "Analyses the variables of the function for uniformity (same value across WIs).", + false, false); + +VariableUniformityAnalysis::VariableUniformityAnalysis() : FunctionPass(ID) { +} + + +void +VariableUniformityAnalysis::getAnalysisUsage(llvm::AnalysisUsage &AU) const { + AU.addRequired<PostDominatorTree>(); + AU.addPreserved<PostDominatorTree>(); + AU.addRequired<LoopInfo>(); + AU.addPreserved<LoopInfo>(); + // required by LoopInfo: + AU.addRequired<DominatorTree>(); + AU.addPreserved<DominatorTree>(); + +// TODO This was turned off because of compilation error +#if 0 +#ifdef LLVM_3_1 + AU.addRequired<TargetData>(); + AU.addPreserved<TargetData>(); +#else + AU.addRequired<DataLayout>(); + AU.addPreserved<DataLayout>(); +#endif +#endif +} + +bool +VariableUniformityAnalysis::runOnFunction(Function &F) { + + /* Do the actual analysis on-demand except for the basic block + divergence analysis. */ + uniformityCache_[&F].clear(); + + /* Mark the canonican induction variable PHI as uniform. + If there's a canonical induction variable in loops, the variable + update for each iteration should be uniform. 
Note: this does not yet imply + all the work-items execute the loop same number of times! */ + llvm::LoopInfo &LI = getAnalysis<LoopInfo>(); + for (llvm::LoopInfo::iterator i = LI.begin(), e = LI.end(); i != e; ++i) { + llvm::Loop *L = *i; + if (llvm::PHINode *inductionVar = L->getCanonicalInductionVariable()) { +#ifdef DEBUG_UNIFORMITY_ANALYSIS + std::cerr << "### canonical induction variable, assuming uniform:"; + inductionVar->dump(); +#endif + setUniform(&F, inductionVar); + } + } + + setUniform(&F, &F.getEntryBlock()); + analyzeBBDivergence(&F, &F.getEntryBlock(), &F.getEntryBlock()); + // F.viewCFG(); + return false; +} + +/** + * BB divergence analysis. + * + * Define: + * Uniform BB. A basic block which is known to be executed by all or none + * of the work-items, that is, a BB where it's known safe to add a barrier. + * + * Divergent/varying BB. A basic block where work-items *might* diverge. + * That is, it cannot be proven that all work-items execute the BB. + * + * Propagate the information from the entry downwards (breadth first). + * This avoids infinite recursion with loop back edges and enables + * to keep book of the "last seen" uniform BB. + * + * The conditions to mark a BB 'uniform': + * + * a) the function entry + * b) BBs that post-dominate at least one uniform BB (try the previously + * found one) + * c) BBs that are branched to directly from a uniform BB using a uniform branch. + * + * Otherwise, assume divergent (might not be *proven* to be one!). 
+ * + */ +void +VariableUniformityAnalysis::analyzeBBDivergence +(llvm::Function *f, llvm::BasicBlock *bb, llvm::BasicBlock *previousUniformBB) { + + + llvm::BasicBlock *newPreviousUniformBB = previousUniformBB; + + llvm::BranchInst *br = + dyn_cast<llvm::BranchInst>(previousUniformBB->getTerminator()); + + if (br == NULL) { + // this is most likely a function with a single basic block, the entry node, which + // ends with a ret + return; + } + + // Condition c) + if ((!br->isConditional() || isUniform(f, br->getCondition()))) { + for (unsigned suc = 0, end = br->getNumSuccessors(); suc < end; ++suc) { + if (br->getSuccessor(suc) == bb) { + setUniform(f, bb, true); + newPreviousUniformBB = bb; + break; + } + } + } + + // Condition b) + if (newPreviousUniformBB != bb) { + llvm::PostDominatorTree *PDT = &getAnalysis<PostDominatorTree>(); + if (PDT->dominates(bb, previousUniformBB)) { + setUniform(f, bb, true); + newPreviousUniformBB = bb; + } + } + + /* Assume diverging. */ + if (!isUniformityAnalyzed(f, bb)) + setUniform(f, bb, false); + + llvm::BranchInst *nextbr = dyn_cast<llvm::BranchInst>(bb->getTerminator()); + + if (nextbr == NULL) return; /* ret */ + + /* Propagate the data downward. */ + for (unsigned suc = 0, end = nextbr->getNumSuccessors(); suc < end; ++suc) { + llvm::BasicBlock *nextbb = nextbr->getSuccessor(suc); + if (!isUniformityAnalyzed(f, nextbb)) { + analyzeBBDivergence(f, nextbb, newPreviousUniformBB); + } + } +} + +bool +VariableUniformityAnalysis::isUniformityAnalyzed(llvm::Function *f, llvm::Value *v) const { + UniformityIndex &cache = uniformityCache_[f]; + UniformityIndex::const_iterator i = cache.find(v); + if (i != cache.end()) { + return true; + } + return false; +} + +/** + * Simple uniformity analysis that recursively analyses all the + * operands affecting the value. 
+ * + * Known uniform Values: + * a) kernel arguments + * b) constants + * + */ +bool +VariableUniformityAnalysis::isUniform(llvm::Function *f, llvm::Value* v) { + + UniformityIndex &cache = uniformityCache_[f]; + UniformityIndex::const_iterator i = cache.find(v); + if (i != cache.end()) { + return (*i).second; + } + + if (llvm::BasicBlock *bb = dyn_cast<llvm::BasicBlock>(v)) { + if (bb == &f->getEntryBlock()) { + setUniform(f, v, true); + return true; + } + } + + if (isa<llvm::Argument>(v)) { + setUniform(f, v, true); + return true; + } + + if (isa<llvm::ConstantInt>(v)) { + setUniform(f, v, true); + return true; + } + + if (isa<llvm::AllocaInst>(v)) { + /* Allocas might or might not be divergent. These are produced + from work-item private arrays or the PHIsToAllocas. It depends + what is written to them whether they are really divergent. + + We need to figure out if any of the stores to the alloca contain + work-item id dependent data. Take a white listing approach that + detects the ex-phi allocas of loop iteration variables of non-diverging + loops. + + Currently the following case is white listed: + a) are scalars + b) are accesses only with load and stores (e.g. address not taken) + c) stored data is uniform + + Because alloca data can be modified in loops and thus be dependent on + itself, we need a bit involved mechanism to handle it. First create + a copy of the uniformity cache, then assume the alloca itself is uniform, + then check if all the stores to the alloca contain uniform data. If + our initial assumption was wrong, restore the cache from the backup. 
+ */ + UniformityCache backupCache(uniformityCache_); + setUniform(f, v); + + bool isUniformAlloca = true; + llvm::Instruction *instruction = dyn_cast<llvm::AllocaInst>(v); + for (Instruction::use_iterator ui = instruction->use_begin(), + ue = instruction->use_end(); + ui != ue; ++ui) { + Instruction *user; + if ((user = dyn_cast<Instruction> (*ui)) == NULL) continue; + + llvm::StoreInst *store = dyn_cast<llvm::StoreInst>(user); + if (store) { + if (!isUniform(f, store->getValueOperand())) { + isUniformAlloca = false; + break; + } + } else if (dyn_cast<llvm::LoadInst>(user) != NULL) { + } else { +#ifdef DEBUG_UNIFORMITY_ANALYSIS + std::cerr << "### alloca has a suspicious user" << std::endl; + user->dump(); +#endif + isUniformAlloca = false; + break; + } + } + + if (!isUniformAlloca) { + // restore the old uniform data as our guess was wrong + uniformityCache_ = backupCache; + } + setUniform(f, v, isUniformAlloca); + + return isUniformAlloca; + } + + /* TODO: global memory loads are uniform in case they are accessing + the higher scope ids (group_id_?). 
*/ + if (isa<llvm::LoadInst>(v)) { + llvm::LoadInst *load = dyn_cast<llvm::LoadInst>(v); + llvm::Value *pointer = load->getPointerOperand(); + llvm::Module *M = load->getParent()->getParent()->getParent(); + + if (pointer == M->getGlobalVariable("_group_id_x") || + pointer == M->getGlobalVariable("_group_id_y") || + pointer == M->getGlobalVariable("_group_id_z") || + pointer == M->getGlobalVariable("_work_dim") || + pointer == M->getGlobalVariable("_num_groups_x") || + pointer == M->getGlobalVariable("_num_groups_y") || + pointer == M->getGlobalVariable("_num_groups_z") || + pointer == M->getGlobalVariable("_global_offset_x") || + pointer == M->getGlobalVariable("_global_offset_y") || + pointer == M->getGlobalVariable("_global_offset_z") || + pointer == M->getGlobalVariable("_local_size_x") || + pointer == M->getGlobalVariable("_local_size_y") || + pointer == M->getGlobalVariable("_local_size_z")) { + + setUniform(f, v, true); + return true; + } + } + + if (isa<llvm::PHINode>(v)) { + /* TODO: PHINodes need control flow analysis: + even if the values are uniform, the selected + value depends on the preceeding basic block which + might depend on the ID. Assume they are not uniform + for now in general and treat the loop iteration + variable as a special case (set externally from a LoopPass). + + TODO: PHINodes can depend (indirectly or directly) on itself in loops + so it would need infinite recursion checking. 
+ */ + setUniform(f, v, false); + return false; + } + + llvm::Instruction *instr = dyn_cast<llvm::Instruction>(v); + if (instr == NULL) { + setUniform(f, v, false); + return false; + } + // not computed previously, scan all operands of the instruction + // and figure out their uniformity recursively + for (unsigned opr = 0; opr < instr->getNumOperands(); ++opr) { + llvm::Value *operand = instr->getOperand(opr); + if (!isUniform(f, operand)) { + setUniform(f, v, false); + return false; + } + } + setUniform(f, v, true); + return true; +} + +void +VariableUniformityAnalysis::setUniform(llvm::Function *f, + llvm::Value *v, + bool isUniform) { + + UniformityIndex &cache = uniformityCache_[f]; + cache[v] = isUniform; + +#ifdef DEBUG_UNIFORMITY_ANALYSIS + std::cerr << "### "; + if (isUniform) + std::cerr << "uniform "; + else + std::cerr << "varying "; + + if (isa<llvm::BasicBlock>(v)) { + std::cerr << "BB: " << v->getName().str() << std::endl; + } else { + v->dump(); + } +#endif +} + +} diff --git a/src/llvmopencl/VariableUniformityAnalysis.h b/src/llvmopencl/VariableUniformityAnalysis.h new file mode 100644 index 0000000..88175a8 --- /dev/null +++ b/src/llvmopencl/VariableUniformityAnalysis.h @@ -0,0 +1,70 @@ +// Header for VariableUniformityAnalysis function pass. +// +// Copyright (c) 2013 Pekka Jääskeläinen / Tampere University of Technology +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
/**
 * Analyses the variables in the function to figure out if a variable
 * value is
 *
 * a) 'uniform', i.e., always same for all work-items in the *same work-group*
 * b) 'varying', i.e., somehow dependent on the work-item id
 *
 * For safety, 'varying' is assumed, unless certain of a).
 *
 * Verdicts are memoized per function in uniformityCache_.
 *
 * NOTE(review): this header uses std::map but does not include <map>
 * itself; it appears to rely on one of the LLVM headers pulling it in.
 */
class VariableUniformityAnalysis : public llvm::FunctionPass {
public:
  // Presumably the usual LLVM pass identifier; defined outside this header.
  static char ID;

  VariableUniformityAnalysis();

  virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const;
  virtual bool runOnFunction(llvm::Function &F);

  // Returns true if 'v' is considered uniform inside 'f'. The result is
  // recorded through setUniform().
  virtual bool isUniform(llvm::Function *f, llvm::Value* v);

  // Stores the verdict for 'v' in the cache entry of 'f', replacing any
  // previous verdict. Callable from outside the pass (it is public).
  virtual void setUniform(llvm::Function *f, llvm::Value *v, bool isUniform=true);

  // NOTE(review): implementation is not in this header; judging by the
  // name it classifies basic blocks as divergent or not -- confirm in the
  // .cc file.
  virtual void analyzeBBDivergence(llvm::Function *f,
                                   llvm::BasicBlock *bb,
                                   llvm::BasicBlock *previousUniformBB);

private:

  bool isUniformityAnalyzed(llvm::Function *f, llvm::Value *val) const;

  // Value -> "is uniform" verdict, for a single function.
  typedef std::map<llvm::Value*, bool> UniformityIndex;
  // Function -> its verdict index.
  typedef std::map<llvm::Function *, UniformityIndex> UniformityCache;
  // 'mutable' so that const members such as isUniformityAnalyzed() can
  // access the cache.
  mutable UniformityCache uniformityCache_;

};
+// +// Additional option is also available to vectorize loads and stores only. +// Still work in progress by vladimir guzma [at] tut fi. +// +//===----------------------------------------------------------------------===// + +#define WIV_NAME "wi-vectorize" +#define DEBUG_TYPE WIV_NAME +#include "config.h" +#ifdef LLVM_3_1 +#include "llvm/Support/IRBuilder.h" +#include "llvm/Support/TypeBuilder.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Intrinsics.h" +#include "llvm/LLVMContext.h" +#include "llvm/Type.h" +#include "llvm/Metadata.h" +#elif defined LLVM_3_2 +#include "llvm/IRBuilder.h" +#include "llvm/TypeBuilder.h" +#include "llvm/DataLayout.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Intrinsics.h" +#include "llvm/LLVMContext.h" +#include "llvm/Type.h" +#include "llvm/Metadata.h" +#include "llvm/TargetTransformInfo.h" +#else +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/TypeBuilder.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Metadata.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#endif +#include "llvm/Pass.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" 
// Command-line options of the wi-vectorize pass. All of them are cl::Hidden.

// When true (the default), runOnFunction() does not look up
// TargetTransformInfo and leaves TTI/VTTI null.
static cl::opt<bool>
IgnoreTargetInfo("wi-vectorize-ignore-target-info", cl::init(true),
  cl::Hidden, cl::desc("Ignore target information"));

// getDepthFactor() returns this value as the weight of a load or a store.
static cl::opt<unsigned>
ReqChainDepth("wi-vectorize-req-chain-depth", cl::init(3), cl::Hidden,
  cl::desc("The required chain depth for vectorization"));

// Bounds the number of fusing rounds in runOnBasicBlock() and is the lane
// multiplier used by getVecTypeForVector().
static cl::opt<unsigned>
VectorWidth("wi-vectorize-vector-width", cl::init(8), cl::Hidden,
  cl::desc("The width of the machine vector in words."));

// Consulted by isVectorizableIntrinsic() for sqrt/powi/sin/cos/log*/exp*/pow.
static cl::opt<bool>
NoMath("wi-vectorize-no-math", cl::init(false), cl::Hidden,
  cl::desc("Don't try to vectorize floating-point math intrinsics"));

// Consulted by isVectorizableIntrinsic() for the fma intrinsic.
static cl::opt<bool>
NoFMA("wi-vectorize-no-fma", cl::init(false), cl::Hidden,
  cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"));

// NOTE: runOnBasicBlock() temporarily overwrites NoMemOps and MemOpsOnly
// when both are false (memory ops first, then everything else) and restores
// the user's values before it returns.
static cl::opt<bool>
NoMemOps("wi-vectorize-no-mem-ops", cl::init(false), cl::Hidden,
  cl::desc("Don't try to vectorize loads and stores"));

static cl::opt<bool>
AlignedOnly("wi-vectorize-aligned-only", cl::init(false), cl::Hidden,
  cl::desc("Only generate aligned loads and stores"));

// See the note on NoMemOps above.
static cl::opt<bool>
MemOpsOnly("wi-vectorize-mem-ops-only", cl::init(false), cl::Hidden,
  cl::desc("Try to vectorize loads and stores only"));

static cl::opt<bool>
NoFP("wi-vectorize-no-fp", cl::init(false), cl::Hidden,
  cl::desc("Don't try to vectorize floating-point operations"));

static cl::opt<bool>
NoCMP("wi-vectorize-no-cmp", cl::init(false), cl::Hidden,
  cl::desc("Don't try to vectorize comparison operations"));

// NOTE(review): the help text reads "based no loop counter"; presumably
// "based on" was meant. The string is left untouched here.
static cl::opt<bool>
NoCount("wi-vectorize-no-counters", cl::init(false), cl::Hidden,
  cl::desc("Forbid vectorization based no loop counter "
           "arithmetic"));
static cl::opt<bool>
NoGEP("wi-vectorize-no-GEP", cl::init(false), cl::Hidden,
  cl::desc("Don't try to vectorize getelementpointer operations"));

// Extra tracing switches; only compiled into builds without NDEBUG.
#ifndef NDEBUG
static cl::opt<bool>
DebugInstructionExamination("wi-vectorize-debug-instruction-examination",
  cl::init(false), cl::Hidden,
  cl::desc("When debugging is enabled, output information on the"
           " instruction-examination process"));
static cl::opt<bool>
DebugCandidateSelection("wi-vectorize-debug-candidate-selection",
  cl::init(false), cl::Hidden,
  cl::desc("When debugging is enabled, output information on the"
           " candidate-selection process"));
static cl::opt<bool>
DebugPairSelection("wi-vectorize-debug-pair-selection",
  cl::init(false), cl::Hidden,
  cl::desc("When debugging is enabled, output information on the"
           " pair-selection process"));
static cl::opt<bool>
DebugCycleCheck("wi-vectorize-debug-cycle-check",
  cl::init(false), cl::Hidden,
  cl::desc("When debugging is enabled, output information on the"
           " cycle-checking process"));
#endif
TargetTransformInfo *TTI; + const VectorTargetTransformInfo *VTTI; +#else + DataLayout *TD; + TargetTransformInfo *TTI; + const TargetTransformInfo *VTTI; +#endif + DenseMap<Value*, Value*> storedSources; + DenseMap<std::pair<int,int>, ValueVector*> stridedOps; + std::multimap<Value*, Value*> flippedStoredSources; + // FIXME: const correct? + + bool vectorizePairs(BasicBlock &BB); + + bool vectorizePhiNodes(BasicBlock &BB); + + bool vectorizeAllocas(BasicBlock& BB); + + void replaceUses(BasicBlock& BB, + AllocaInst& oldAlloca, + AllocaInst& newAlloca, + int indx); + + Type* newAllocaType(Type* start, unsigned int width); + + bool removeDuplicates(BasicBlock &BB); + + void dropUnused(BasicBlock& BB); + + bool getCandidatePairs(BasicBlock &BB, + BasicBlock::iterator &Start, + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts); + + bool getCandidateAllocas(BasicBlock &BB, + std::multimap<int, ValueVector *>& candidateAllocas); + + void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs); + + void buildDepMap(BasicBlock &BB, + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &PairableInstUsers); + + void choosePairs(std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<Value *, Value *>& ChosenPairs); + + void fuseChosenPairs(BasicBlock &BB, + std::vector<Value *> &PairableInsts, + DenseMap<Value *, Value *>& ChosenPairs); + + bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); + + bool areInstsCompatible(Instruction *I, Instruction *J, + bool IsSimpleLoadStore); + + bool areInstsCompatibleFromDifferentWi(Instruction *I, Instruction *J); + + bool trackUsesOfI(DenseSet<Value *> &Users, + AliasSetTracker 
&WriteSet, Instruction *I, + Instruction *J, bool UpdateUsers = true, + std::multimap<Value *, Value *> *LoadMoveSet = 0); + + void computePairsConnectedTo( + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + ValuePair P); + + bool pairsConflict(ValuePair P, ValuePair Q, + DenseSet<ValuePair> &PairableInstUsers, + std::multimap<ValuePair, ValuePair> *PairableInstUserMap = 0); + + bool pairWillFormCycle(ValuePair P, + std::multimap<ValuePair, ValuePair> &PairableInstUsers, + DenseSet<ValuePair> &CurrentPairs); + + void pruneTreeFor( + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + std::multimap<ValuePair, ValuePair> &PairableInstUserMap, + DenseMap<Value *, Value *> &ChosenPairs, + DenseMap<ValuePair, size_t> &Tree, + DenseSet<ValuePair> &PrunedTree, ValuePair J, + bool UseCycleCheck); + + void buildInitialTreeFor( + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<Value *, Value *> &ChosenPairs, + DenseMap<ValuePair, size_t> &Tree, ValuePair J); + + void findBestTreeFor( + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + std::multimap<ValuePair, ValuePair> &PairableInstUserMap, + DenseMap<Value *, Value *> &ChosenPairs, + DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, + size_t &BestEffSize, VPIteratorPair ChoiceRange, + bool UseCycleCheck); + + Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, + Instruction *J, unsigned o, bool FlipMemInputs); + + void fillNewShuffleMask(LLVMContext& Context, Instruction *J, + unsigned NumElem, 
unsigned MaskOffset, unsigned NumInElem, + unsigned IdxOffset, std::vector<Constant*> &Mask); + + Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I, + Instruction *J); + + Value *getReplacementInput(LLVMContext& Context, Instruction *I, + Instruction *J, unsigned o, bool FlipMemInputs); + + Value* CommonShuffleSource(Instruction *I, Instruction *J); + + void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, + Instruction *J, SmallVector<Value *, 3> &ReplacedOperands, + bool FlipMemInputs); + + void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, + Instruction *J, Instruction *K, + Instruction *&InsertionPt, Instruction *&K1, + Instruction *&K2, bool FlipMemInputs); + + void collectPairLoadMoveSet(BasicBlock &BB, + DenseMap<Value *, Value *> &ChosenPairs, + std::multimap<Value *, Value *> &LoadMoveSet, + Instruction *I); + + void collectLoadMoveSet(BasicBlock &BB, + std::vector<Value *> &PairableInsts, + DenseMap<Value *, Value *> &ChosenPairs, + std::multimap<Value *, Value *> &LoadMoveSet); + + void moveUsesOfIAfterJ(BasicBlock &BB, + std::multimap<Value *, Value *> &LoadMoveSet, + Instruction *&InsertionPt, + Instruction *I, Instruction *J); + + void collectPtrInfo(std::vector<Value *> &PairableInsts, + DenseMap<Value *, Value *> &ChosenPairs, + DenseSet<Value *> &LowPtrInsts); + + bool doInitialization(Module& /*m*/) { + return false; + } + bool doFinalization(Module& /*m*/) { + return false; + } + virtual bool runOnFunction(Function &Func) { + + AA = &getAnalysis<AliasAnalysis>(); + SE = &getAnalysis<ScalarEvolution>(); +#ifdef LLVM_3_1 + TD = getAnalysisIfAvailable<TargetData>(); +#elif defined LLVM_3_2 + TD = getAnalysisIfAvailable<DataLayout>(); + TTI = IgnoreTargetInfo ? 0 : + getAnalysisIfAvailable<TargetTransformInfo>(); + VTTI = TTI ? TTI->getVectorTargetTransformInfo() : 0; +#else + TD = getAnalysisIfAvailable<DataLayout>(); + TTI = IgnoreTargetInfo ? 
0 : + getAnalysisIfAvailable<TargetTransformInfo>(); + VTTI = TTI; +#endif + + bool changed = false; + for (Function::iterator i = Func.begin(); + i != Func.end(); i++) { + changed |=runOnBasicBlock(*i); + } + return changed; + } + + virtual bool runOnBasicBlock(BasicBlock &BB) { + + bool changed = false; + + // First try to create vectors of all allocas, if there are any + changed |= vectorizeAllocas(BB); + // Iterate a sufficient number of times to merge types of size 1 bit, + // then 2 bits, then 4, etc. up to half of the target vector width of the + // target vector register. + bool vectorizeTwice = false; + + + // There are 3 possible cases of vectorization in regards to memory + // operations: + // 1: Explicitly forbid vectorization of mem ops (NoMemOps) + // 2: Allow only vectorization of mem ops (MemOpsOnly) + // 3: Vectorize mem ops as well as everything else + // In cases 1 and 2, following test makes sure vectorization is + // run only once. + // For case 3, we first run vectorization of memory operations only + // and then we run vectorization of everything else. In between + // we remove unused operations, which are typicaly memory + // access computations that are not needed anymore and their vectorization + // is waste of resources. Instruction combiner is not able to get rid + // of those on it's own once they are in vectors. + + // Store original values of two variables. They can be changed bellow + // but have to be restored before calling this for next BB. 
+ bool originalMemOpsOnly = MemOpsOnly; + bool originalNoMemOps = NoMemOps; + if (!MemOpsOnly && !NoMemOps) { + MemOpsOnly = true; + vectorizeTwice = true; + } +#if 0 +#ifdef LLVM_3_3 + if (TTI) { + std::cerr << " settign new vector width" << std::endl; + unsigned WidestRegister = TTI->getRegisterBitWidth(true); + VectorWidth = WidestRegister/32; + std::cerr << VectorWidth << std::endl; + } +#endif +#endif + + for (unsigned v = 2, n = 1; v <= VectorWidth; + v *= 2, ++n) { + DEBUG(dbgs() << "WIV: fusing memm only in loop #" << n << + " for " << BB.getName() << " in " << + BB.getParent()->getName() << "...\n"); + if (vectorizePairs(BB)) { + dropUnused(BB); + changed = true; + } + else + break; + } + if (vectorizeTwice) { + MemOpsOnly = false; + NoMemOps = true; + for (unsigned v = 2, n = 1; v <= VectorWidth; + v *= 2, ++n) { + DEBUG(dbgs() << "WIV: fusing loop #" << n << + " for " << BB.getName() << " in " << + BB.getParent()->getName() << "...\n"); + if (vectorizePairs(BB)) { + dropUnused(BB); + changed = true; + } + else + break; + } + } + + if (changed) { + vectorizePhiNodes(BB); + removeDuplicates(BB); + } + + DEBUG(dbgs() << "WIV: done!\n"); + MemOpsOnly = originalMemOpsOnly; + NoMemOps = originalNoMemOps; + return changed; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + FunctionPass::getAnalysisUsage(AU); + AU.addRequired<AliasAnalysis>(); + AU.addRequired<ScalarEvolution>(); + AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<ScalarEvolution>(); + AU.setPreservesCFG(); + } + // This returns the vector type that holds a pair of the provided type. + // If the provided type is already a vector, then its length is doubled. 
+ static inline VectorType *getVecTypeForVector(Type *ElemTy) { + if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) { + unsigned numElem = VTy->getNumElements(); + return VectorType::get(ElemTy->getScalarType(), numElem*VectorWidth); + } else { + return VectorType::get(ElemTy->getScalarType(), VectorWidth); + + } + + return VectorType::get(ElemTy, 2); + } + // This returns the vector type that holds a pair of the provided type. + // If the provided type is already a vector, then its length is doubled. + static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) { + assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() && + "Cannot form vector from incompatible scalar types"); + Type *STy = ElemTy->getScalarType(); + + unsigned numElem; + if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) { + numElem = VTy->getNumElements(); + } else { + numElem = 1; + } + + if (VectorType *VTy = dyn_cast<VectorType>(Elem2Ty)) { + numElem += VTy->getNumElements(); + } else { + numElem += 1; + } + + return VectorType::get(STy, numElem); + } + + std::string getReplacementName(Instruction *I, bool IsInput, unsigned o, + unsigned n = 0) { + if (!I->hasName()) + return ""; + + return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) + + (n > 0 ? "." + utostr(n) : "")).str(); + } + + // Returns the weight associated with the provided value. A chain of + // candidate pairs has a length given by the sum of the weights of its + // members (one weight per pair; the weight of each member of the pair + // is assumed to be the same). This length is then compared to the + // chain-length threshold to determine if a given chain is significant + // enough to be vectorized. The length is also used in comparing + // candidate chains where longer chains are considered to be better. + // Note: when this function returns 0, the resulting instructions are + // not actually fused. 
+ static inline size_t getDepthFactor(Value *V) { + // InsertElement and ExtractElement have a depth factor of zero. This is + // for two reasons: First, they cannot be usefully fused. Second, because + // the pass generates a lot of these, they can confuse the simple metric + // used to compare the trees in the next iteration. Thus, giving them a + // weight of zero allows the pass to essentially ignore them in + // subsequent iterations when looking for vectorization opportunities + // while still tracking dependency chains that flow through those + // instructions. + if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V)) + return 0; + + // Give a load or store half of the required depth so that load/store + // pairs will vectorize. + if ((isa<LoadInst>(V) || isa<StoreInst>(V))) + return ReqChainDepth; + + return 1; + } + // Returns the cost of the provided instruction using VTTI. + // This does not handle loads and stores. + unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2) { +#ifdef LLVM_3_1 + return 1; +#else + switch (Opcode) { + default: break; + case Instruction::GetElementPtr: + // We mark this instruction as zero-cost because scalar GEPs are usually + // lowered to the intruction addressing mode. At the moment we don't + // generate vector GEPs. 
+ return 0; + case Instruction::Br: + return VTTI->getCFInstrCost(Opcode); + case Instruction::PHI: + return 0; + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + return VTTI->getArithmeticInstrCost(Opcode, T1); + case Instruction::Select: + case Instruction::ICmp: + case Instruction::FCmp: + return VTTI->getCmpSelInstrCost(Opcode, T1, T2); + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: + case Instruction::ShuffleVector: + return VTTI->getCastInstrCost(Opcode, T1, T2); + } + return 1; +#endif + } + // This determines the relative offset of two loads or stores, returning + // true if the offset could be determined to be some constant value. + // For example, if OffsetInElmts == 1, then J accesses the memory directly + // after I; if OffsetInElmts == -1 then I accesses the memory + // directly after J. This function assumes that both instructions + // have the same type. 
+ bool getPairPtrInfo(Instruction *I, Instruction *J, + Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, + unsigned &IAddressSpace, unsigned &JAddressSpace, + int64_t &OffsetInElmts) { + OffsetInElmts = 0; + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + LoadInst *LJ = cast<LoadInst>(J); + IPtr = LI->getPointerOperand(); + JPtr = LJ->getPointerOperand(); + IAlignment = LI->getAlignment(); + JAlignment = LJ->getAlignment(); + IAddressSpace = LI->getPointerAddressSpace(); + JAddressSpace = LJ->getPointerAddressSpace(); + } else if (isa<GetElementPtrInst>(I)) { + Instruction::op_iterator it = cast<GetElementPtrInst>(I)->idx_begin(); + IPtr = *it; + Instruction::op_iterator jt = cast<GetElementPtrInst>(J)->idx_begin(); + JPtr = *jt; + if (!IPtr || !JPtr) + return false; + IAlignment = 0; + JAlignment = 0; + } else { + StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J); + IPtr = SI->getPointerOperand(); + JPtr = SJ->getPointerOperand(); + IAlignment = SI->getAlignment(); + JAlignment = SJ->getAlignment(); + IAddressSpace = SI->getPointerAddressSpace(); + JAddressSpace = SJ->getPointerAddressSpace(); + } + if ((isa<GetElementPtrInst>(I) && !SE->isSCEVable(IPtr->getType())) + || (isa<GetElementPtrInst>(J) && !SE->isSCEVable(JPtr->getType()))) { + // Asume, the getelementpointer is already vector, so the pointer + // operand is also the vector and LLVM scalar evaluation can + // not understand it. + OffsetInElmts = 2; + return true; + } + const SCEV *IPtrSCEV = SE->getSCEV(IPtr); + const SCEV *JPtrSCEV = SE->getSCEV(JPtr); + + // If this is a trivial offset, then we'll get something like + // 1*sizeof(type). With target data, which we need anyway, this will get + // constant folded into a number. 
+ const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV); + if (const SCEVConstant *ConstOffSCEV = + dyn_cast<SCEVConstant>(OffsetSCEV)) { + ConstantInt *IntOff = ConstOffSCEV->getValue(); + int64_t Offset = IntOff->getSExtValue(); + if (isa<GetElementPtrInst>(I)) { + OffsetInElmts = Offset; + return (abs64(Offset)) > 1; + } + Type *VTy = cast<PointerType>(IPtr->getType())->getElementType(); + int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy); + + Type *VTy2 = cast<PointerType>(JPtr->getType())->getElementType(); + if (VTy != VTy2 && Offset < 0) { + int64_t VTy2TSS = (int64_t) TD->getTypeStoreSize(VTy2); + OffsetInElmts = Offset/VTy2TSS; + return (abs64(Offset) % VTy2TSS) == 0; + } + OffsetInElmts = Offset/VTyTSS; + + return (abs64(Offset) % VTyTSS) == 0; + } + return false; + } + + // Returns true if the provided CallInst represents an intrinsic that can + // be vectorized. + bool isVectorizableIntrinsic(CallInst* I) { + Function *F = I->getCalledFunction(); + if (!F) return false; + + unsigned IID = F->getIntrinsicID(); + if (!IID) return false; + + switch(IID) { + default: + return false; + case Intrinsic::sqrt: + case Intrinsic::powi: + case Intrinsic::sin: + case Intrinsic::cos: + case Intrinsic::log: + case Intrinsic::log2: + case Intrinsic::log10: + case Intrinsic::exp: + case Intrinsic::exp2: + case Intrinsic::pow: + return !NoMath; + case Intrinsic::fma: + return !NoFMA; + } + } + + // Returns true if J is the second element in some pair referenced by + // some multimap pair iterator pair. + template <typename V> + bool isSecondInIteratorPair(V J, std::pair< + typename std::multimap<V, V>::iterator, + typename std::multimap<V, V>::iterator> PairRange) { + for (typename std::multimap<V, V>::iterator K = PairRange.first; + K != PairRange.second; ++K) + if (K->second == J) return true; + + return false; + } + }; + // In some cases, instructions did not get combined correctly by previous passes. 
+ // For example with large number of replicated work items, scalar load of constant + // happened for first work item and then exactly same load in 15 and 30th work item. + // The work items in between reused the previous value. + // Also, the vectorization vectorization leads to situations where scalar value + // needs to be replicated to create vector, however, separate vectors were + // created each time the value was to be used. + // This fixes that by search for exactly same Instructions, with same type + // and exactly same parameters and removing later one of them, replacing + // all uses with former. + bool WIVectorize::removeDuplicates(BasicBlock &BB) { + BasicBlock::iterator Start = BB.getFirstInsertionPt(); + BasicBlock::iterator End = BB.end(); + for (BasicBlock::iterator I = Start; I != End; ++I) { + BasicBlock::iterator J = llvm::next(I); + + for ( ; J != End; ) { + + if (isa<AllocaInst>(I) || !I->isIdenticalTo(J)) { + J = llvm::next(J); + continue; + } else { + J->replaceAllUsesWith(I); + AA->replaceWithNewValue(J, I); + SE->forgetValue(J); + BasicBlock::iterator K = llvm::next(J); + J->eraseFromParent(); + J = K; + } + } + } + + return false; + } + // Replace phi nodes of individual valiables with vector they originated + // from. 
+ bool WIVectorize::vectorizePhiNodes(BasicBlock &BB) { + BasicBlock::iterator Start = BB.begin(); + BasicBlock::iterator End = BB.getFirstInsertionPt(); + + ValueVectorMap valueMap; + for (BasicBlock::iterator I = Start; I != End; ++I) { + PHINode* node = dyn_cast<PHINode>(I); + if (node) { + ValueVector* candidateVector = new ValueVector; + for (BasicBlock::iterator J = llvm::next(I); + J != End; ++J) { + PHINode* node2 = dyn_cast<PHINode>(J); + if (node2) { + bool match = true; + if (node->getNumIncomingValues() != + node2->getNumIncomingValues()) + continue; + + for (unsigned int i = 0; + i < node->getNumIncomingValues(); i++) { + Value* v1 = node->getIncomingValue(i); + Value* v2 = node2->getIncomingValue(i); + if (node->getIncomingBlock(i) != + node2->getIncomingBlock(i)) { + match = false; + } + // Stored sources contain original value from + // which one in phi node was extracted from + DenseMap<Value*, Value*>::iterator vi = + storedSources.find(v1); + if (vi != storedSources.end()) { + DenseMap<Value*, Value*>::iterator ji = + storedSources.find(v2); + if (ji != storedSources.end() && + (*vi).second == (*ji).second) { + } else { + match = false; + } + } else { + // Incaming value can be also constant, they + // have to match. 
+ Constant* const1 = dyn_cast<Constant>(v1); + Constant* const2 = dyn_cast<Constant>(v2); + if (!(const1 && const2)) /* && + const1->getValue() == const2->getValue())) */{ + match = false; + } + } + } + if (match) + candidateVector->push_back(node2); + } + } + if (candidateVector->size() == VectorWidth -1) { + Value* newV = cast<Value>(node); + valueMap[newV] = candidateVector; + } + } + } + // Actually create new phi node + for (DenseMap<Value*, ValueVector*>::iterator i = + valueMap.begin(); i != valueMap.end(); i++) { + ValueVector& v = *(*i).second; + PHINode* orig = cast<PHINode>((*i).first); + Type *IType = orig->getType(); + Type *VType = getVecTypeForVector(IType); + PHINode* phi = PHINode::Create(VType, orig->getNumIncomingValues(), + getReplacementName(orig, false,0), orig); + // Add incoming pairs to the phi node. + for (unsigned int i = 0; i < orig->getNumIncomingValues(); i++) { + Value* inc = orig->getIncomingValue(i); + BasicBlock* BB = orig->getIncomingBlock(i); + DenseMap<Value*, Value*>::iterator iter = + storedSources.find(inc); + if (iter != storedSources.end()) { + phi->addIncoming((*iter).second, BB); + } else { + Constant* origConst = cast<Constant>(inc); + Constant* cons = ConstantVector::getSplat( + VectorWidth, origConst); + phi->addIncoming(cons, BB); + } + } + // Extract scalar values from phi node to be used in the body + // of basic block. Replacing their uses cause instruction combiner + // to find extractlement -> insertelement pairs and drop them + // leaving direct use of vector. + LLVMContext& Context = BB.getContext(); + BasicBlock::iterator toFill = BB.getFirstInsertionPt(); + int index = 0; + + // Find from the user of original phi node in which position it + // is inserted to the vector before being used by vector instruction. + // We have to extract it from same position of the vector phi node. 
+ Instruction::use_iterator useiter = orig->use_begin(); + while (useiter != orig->use_end()) { + llvm::User* tmp = *useiter; + if (isa<InsertElementInst>(tmp)) { + Value* in = tmp->getOperand(2); + if (isa<ConstantInt>(in)) { + index = + cast<ConstantInt>(in)->getZExtValue(); + break; + } + } + useiter++; + } + + //} + Value *X = ConstantInt::get(Type::getInt32Ty(Context), index); + Instruction* other = ExtractElementInst::Create(phi, X, + getReplacementName(phi, false, 0)); + other->insertAfter(toFill); + orig->replaceAllUsesWith(other); + AA->replaceWithNewValue(orig, other); + SE->forgetValue(orig); + orig->eraseFromParent(); + Instruction* ins = other; + for (unsigned int i = 0; i < v.size(); i++) { + Instruction* tmp = cast<Instruction>(v[i]); + // Find from the user of original phi node in which position it + // is inserted to the vector before being used by vector instruction. + // We have to extract it from same position of the vector phi node. + Instruction::use_iterator ui = tmp->use_begin(); + while (ui != tmp->use_end()) { + llvm::User* user = *ui; + if (isa<InsertElementInst>(user)) { + Value* in = user->getOperand(2); + if (isa<ConstantInt>(in)) { + index = + cast<ConstantInt>(in)->getZExtValue(); + break; + } + } + ui++; + } + X = ConstantInt::get(Type::getInt32Ty(Context), index); + Instruction* other = ExtractElementInst::Create(phi, X, + getReplacementName(phi, false, index)); + other->insertAfter(ins); + + tmp->replaceAllUsesWith(other); + AA->replaceWithNewValue(tmp, other); + SE->forgetValue(tmp); + tmp->eraseFromParent(); + ins = other; + } + + } + return true; + } + // This function implements one vectorization iteration on the provided + // basic block. It returns true if the block is changed. 
+ bool WIVectorize::vectorizePairs(BasicBlock &BB) { + bool ShouldContinue; + BasicBlock::iterator Start = BB.getFirstInsertionPt(); + + std::vector<Value *> AllPairableInsts; + DenseMap<Value *, Value *> AllChosenPairs; + + std::vector<Value *> PairableInsts; + std::multimap<Value *, Value *> CandidatePairs; + ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs, + PairableInsts); + if (PairableInsts.empty()) return false; + // Now we have a map of all of the pairable instructions and we need to + // select the best possible pairing. A good pairing is one such that the + // users of the pair are also paired. This defines a (directed) forest + // over the pairs such that two pairs are connected iff the second pair + // uses the first. + + // Note that it only matters that both members of the second pair use some + // element of the first pair (to allow for splatting). + + std::multimap<ValuePair, ValuePair> ConnectedPairs; + computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs); + + // Build the pairable-instruction dependency map + DenseSet<ValuePair> PairableInstUsers; + buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers); + + // There is now a graph of the connected pairs. For each variable, pick + // the pairing with the largest tree meeting the depth requirement on at + // least one branch. Then select all pairings that are part of that tree + // and remove them from the list of available pairings and pairable + // variables. + + DenseMap<Value *, Value *> ChosenPairs; + choosePairs(CandidatePairs, PairableInsts, ConnectedPairs, + PairableInstUsers, ChosenPairs); + + if (ChosenPairs.empty()) + return false; + + AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(), + PairableInsts.end()); + AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end()); + + if (AllChosenPairs.empty()) return false; + NumFusedOps += AllChosenPairs.size(); + + // A set of pairs has now been selected. 
It is now necessary to replace the + // paired instructions with vector instructions. For this procedure each + // operand must be replaced with a vector operand. This vector is formed + // by using build_vector on the old operands. The replaced values are then + // replaced with a vector_extract on the result. Subsequent optimization + // passes should coalesce the build/extract combinations. + + fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs); + + return true; + } + + // This function returns true if the provided instruction is capable of being + // fused into a vector instruction. This determination is based only on the + // type and other attributes of the instruction. + bool WIVectorize::isInstVectorizable(Instruction *I, + bool &IsSimpleLoadStore) { + IsSimpleLoadStore = false; + + if (MemOpsOnly && + !(isa<LoadInst>(I) || isa<StoreInst>(I) || isa<GetElementPtrInst>(I))) + return false; + + if (CallInst *C = dyn_cast<CallInst>(I)) { + if (!isVectorizableIntrinsic(C)) { + return false; + + } + } else if (LoadInst *L = dyn_cast<LoadInst>(I)) { + // Vectorize simple loads if possbile: + IsSimpleLoadStore = L->isSimple(); + if (!IsSimpleLoadStore || NoMemOps) { + return false; + } + } else if (StoreInst *S = dyn_cast<StoreInst>(I)) { + // Vectorize simple stores if possbile: + IsSimpleLoadStore = S->isSimple(); + if (!IsSimpleLoadStore || NoMemOps) { + return false; + } + } else if (CastInst *C = dyn_cast<CastInst>(I)) { + // We can vectorize casts, but not casts of pointer types, etc. + + Type *SrcTy = C->getSrcTy(); + if (!SrcTy->isSingleValueType() || SrcTy->isPointerTy()) { + return false; + } + Type *DestTy = C->getDestTy(); + if (!DestTy->isSingleValueType() || DestTy->isPointerTy()) { + return false; + } + } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) { + // Currently, vector GEPs exist only with one index. 
+ if (G->getNumIndices() != 1 || NoMemOps || NoGEP) + return false; + } else if (isa<CmpInst>(I)) { + if (NoCMP) + return false; + } else if (!(I->isBinaryOp())){ /*|| isa<ShuffleVectorInst>(I) || + isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) {*/ + return false; + } + // We can't vectorize memory operations without target data + if (TD == 0 && IsSimpleLoadStore) + return false; + + Type *T1, *T2; + if (isa<StoreInst>(I)) { + // For stores, it is the value type, not the pointer type that matters + // because the value is what will come from a vector register. + + Value *IVal = cast<StoreInst>(I)->getValueOperand(); + T1 = IVal->getType(); + } else { + T1 = I->getType(); + } + + if (I->isCast()) + T2 = cast<CastInst>(I)->getSrcTy(); + else + T2 = T1; + + // Not every type can be vectorized... + if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) || + !(VectorType::isValidElementType(T2) || T2->isVectorTy())) { + return false; + } + if ((T1->getPrimitiveSizeInBits() > (VectorWidth*32)/2 || + T2->getPrimitiveSizeInBits() > (VectorWidth*32)/2)) { + return false; + } + + // Floating point vectorization can be dissabled + if (I->getType()->isFloatingPointTy() && NoFP) + return false; + + // Do not vectorizer pointer types. Currently do not work with LLVM 3.1. + if (!isa<GetElementPtrInst>(I) && + (T1->getScalarType()->isPointerTy() || + T2->getScalarType()->isPointerTy())) + return false; + // Check if the instruction can be loop counter, we do not vectorize those + // since they have to be same for all work items we are vectorizing + // and computations of load/store indexes usually depenends on them. + // Instruction combiner pass will remove duplicates. + if (SE->isSCEVable(I->getType())) { + const SCEV* sc = SE->getSCEV(I); + if (const SCEVAddRecExpr* S = dyn_cast<SCEVAddRecExpr>(sc)) { + if (I->hasNUses(2)) { + // Loop counter instruction is used in the comparison + // operation before branch and with the phi node. 
+ // Any more uses indicates that the instruction is also + // used as part of some computation and possibly needs + // to get vectorize. + bool compare = false; + bool phi = false; + for (Value::use_iterator it = I->use_begin(); + it != I->use_end(); + it++) { + if (isa<CmpInst>(*it)) + compare = true; + if (isa<PHINode>(*it)) + phi = true; + } + if (compare && phi) + return false; + } + } + } + return true; + } + // This function returns true if the two provided instructions are compatible + // (meaning that they can be fused into a vector instruction). This assumes + // that I has already been determined to be vectorizable and that J is not + // in the use tree of I. + bool WIVectorize::areInstsCompatibleFromDifferentWi(Instruction *I, + Instruction *J) { + + if (I->getMetadata("wi") == NULL || J->getMetadata("wi") == NULL) { + return false; + } + if (MemOpsOnly && + !((isa<LoadInst>(I) && isa<LoadInst>(J)) || + (isa<StoreInst>(I) && isa<StoreInst>(J)) || + (isa<GetElementPtrInst>(I) && isa<GetElementPtrInst>(J)))) { + return false; + } + MDNode* mi = I->getMetadata("wi"); + MDNode* mj = J->getMetadata("wi"); + assert(mi->getNumOperands() == 3); + assert(mj->getNumOperands() == 3); + + // Second operand of MDNode contains MDNode with XYZ tripplet. 
+ MDNode* iXYZ= dyn_cast<MDNode>(mi->getOperand(2)); + MDNode* jXYZ= dyn_cast<MDNode>(mj->getOperand(2)); + assert(iXYZ->getNumOperands() == 4); + assert(jXYZ->getNumOperands() == 4); + + ConstantInt *CIX = dyn_cast<ConstantInt>(iXYZ->getOperand(1)); + ConstantInt *CJX = dyn_cast<ConstantInt>(jXYZ->getOperand(1)); + + ConstantInt *CIY = dyn_cast<ConstantInt>(iXYZ->getOperand(2)); + ConstantInt *CJY = dyn_cast<ConstantInt>(jXYZ->getOperand(2)); + + ConstantInt *CIZ = dyn_cast<ConstantInt>(iXYZ->getOperand(3)); + ConstantInt *CJZ = dyn_cast<ConstantInt>(jXYZ->getOperand(3)); + + if ( CIX->getValue() == CJX->getValue() + && CIY->getValue() == CJY->getValue() + && CIZ->getValue() == CJZ->getValue()) { + // Same work item, no vectorizing + return false; + } + mi = I->getMetadata("wi_counter"); + mj = J->getMetadata("wi_counter"); + + ConstantInt *CI = dyn_cast<ConstantInt>(mi->getOperand(1)); + ConstantInt *CJ = dyn_cast<ConstantInt>(mj->getOperand(1)); + if (CI->getValue() != CJ->getValue()) { + // different line in the original work item + // we do not want to vectorize operations that do not match + return false; + } + return true; + } + static inline void getInstructionTypes(Instruction *I, + Type *&T1, Type *&T2) { + if (isa<StoreInst>(I)) { + // For stores, it is the value type, not the pointer type that matters + // because the value is what will come from a vector register. 
+ + Value *IVal = cast<StoreInst>(I)->getValueOperand(); + T1 = IVal->getType(); + } else { + T1 = I->getType(); + } + + if (I->isCast()) + T2 = cast<CastInst>(I)->getSrcTy(); + else + T2 = T1; + + if (SelectInst *SI = dyn_cast<SelectInst>(I)) { + T2 = SI->getCondition()->getType(); + } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) { + T2 = SI->getOperand(0)->getType(); + } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) { + T2 = CI->getOperand(0)->getType(); + } + } + + // This function returns true if the two provided instructions are compatible + // (meaning that they can be fused into a vector instruction). This assumes + // that I has already been determined to be vectorizable and that J is not + // in the use tree of I. + bool WIVectorize::areInstsCompatible(Instruction *I, Instruction *J, + bool IsSimpleLoadStore) { + DEBUG( if (DebugInstructionExamination) dbgs() << "WIV: looking at " << *I << + " <-> " << *J << "\n"); + + // Loads and stores can be merged if they have different alignments, + // but are otherwise the same. + LoadInst *LI, *LJ; + StoreInst *SI, *SJ; + if (!J->isSameOperationAs(I)) { + return false; + } + Type *IT1, *IT2, *JT1, *JT2; + getInstructionTypes(I, IT1, IT2); + getInstructionTypes(J, JT1, JT2); + + if (IsSimpleLoadStore || isa<GetElementPtrInst>(I)) { + Value *IPtr, *JPtr; + unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; + int64_t OffsetInElmts = 0; + bool foundPointer = + getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, + IAddressSpace, JAddressSpace, OffsetInElmts); + if ( foundPointer && abs64(OffsetInElmts) == 1) { + Type *aTypeI = isa<StoreInst>(I) ? + cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); + Type *aTypeJ = isa<StoreInst>(J) ? 
+ cast<StoreInst>(J)->getValueOperand()->getType() : J->getType(); + Type *VType = getVecTypeForPair(aTypeI, aTypeJ); + // An aligned load or store is possible only if the instruction + // with the lower offset has an alignment suitable for the + // vector type. + + unsigned BottomAlignment = IAlignment; + if (OffsetInElmts < 0) BottomAlignment = JAlignment; + + unsigned VecAlignment = TD->getPrefTypeAlignment(VType); + if (AlignedOnly) { + if (BottomAlignment < VecAlignment) { + return false; + } + } +#ifndef LLVM_3_1 + if (VTTI) { + unsigned ICost = VTTI->getMemoryOpCost(I->getOpcode(), I->getType(), + IAlignment, IAddressSpace); + unsigned JCost = VTTI->getMemoryOpCost(J->getOpcode(), J->getType(), + JAlignment, JAddressSpace); + unsigned VCost = VTTI->getMemoryOpCost(I->getOpcode(), VType, + BottomAlignment, + IAddressSpace); + if (VCost > ICost + JCost) + return false; + + // We don't want to fuse to a type that will be split, even + // if the two input types will also be split and there is no other + // associated cost. + unsigned VParts = VTTI->getNumberOfParts(VType); + if (VParts > 1) + return false; + else if (!VParts && VCost == ICost + JCost) + return false; + + } +#endif + } else if(foundPointer && abs64(OffsetInElmts)>1){ + if (isa<GetElementPtrInst>(I)) { + return true; + } + // Collect information on memory accesses with stride. + // This is not usefull for anything, just to analyze code a bit. 
+ if (I->getMetadata("wi") != NULL) { + MDNode* md = I->getMetadata("wi"); + MDNode* mdCounter = I->getMetadata("wi_counter"); + MDNode* mdRegion = dyn_cast<MDNode>(md->getOperand(1)); + + unsigned CI = + cast<ConstantInt>(mdCounter->getOperand(1))->getZExtValue(); + unsigned RI = + cast<ConstantInt>(mdRegion->getOperand(1))->getZExtValue(); + std::pair<int, int> index = std::pair<int,int>(RI,CI); + DenseMap<std::pair<int,int>, ValueVector*>::iterator it = + stridedOps.find(index); + ValueVector* v = NULL; + if (it != stridedOps.end()) { + v = (*it).second; + } else { + v = new ValueVector; + } + v->push_back(I); + v->push_back(J); + stridedOps.insert( + std::pair< std::pair<int, int>, ValueVector*>(index, v)); + } + return false; + } else { + return false; + } + } else if (isa<ShuffleVectorInst>(I)) { + // Only merge two shuffles if they're both constant + return isa<Constant>(I->getOperand(2)) && + isa<Constant>(J->getOperand(2)); + // FIXME: We may want to vectorize non-constant shuffles also. +#ifdef LLVM_3_1 + } +#else + } else if (VTTI) { + unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2); + unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2); + Type *VT1 = getVecTypeForPair(IT1, JT1), + *VT2 = getVecTypeForPair(IT2, JT2); + unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2); + + if (VCost > ICost + JCost) { + return false; + } + // We don't want to fuse to a type that will be split, even + // if the two input types will also be split and there is no other + // associated cost. + unsigned VParts1 = VTTI->getNumberOfParts(VT1), + VParts2 = VTTI->getNumberOfParts(VT2); + if (VParts1 > 1 || VParts2 > 1) + return false; + else if ((!VParts1 || !VParts2) && VCost == ICost + JCost) + return false; + + //CostSavings = ICost + JCost - VCost; + } +#endif + // The powi intrinsic is special because only the first argument is + // vectorized, the second arguments must be equal. 
+ CallInst *CI = dyn_cast<CallInst>(I); + Function *FI; + if (CI && (FI = CI->getCalledFunction()) && + FI->getIntrinsicID() == Intrinsic::powi) { + + Value *A1I = CI->getArgOperand(1), + *A1J = cast<CallInst>(J)->getArgOperand(1); + const SCEV *A1ISCEV = SE->getSCEV(A1I), + *A1JSCEV = SE->getSCEV(A1J); + return (A1ISCEV == A1JSCEV); + } + return true; + } + + // Figure out whether or not J uses I and update the users and write-set + // structures associated with I. Specifically, Users represents the set of + // instructions that depend on I. WriteSet represents the set + // of memory locations that are dependent on I. If UpdateUsers is true, + // and J uses I, then Users is updated to contain J and WriteSet is updated + // to contain any memory locations to which J writes. The function returns + // true if J uses I. By default, alias analysis is used to determine + // whether J reads from memory that overlaps with a location in WriteSet. + // If LoadMoveSet is not null, then it is a previously-computed multimap + // where the key is the memory-based user instruction and the value is + // the instruction to be compared with I. So, if LoadMoveSet is provided, + // then the alias analysis is not used. This is necessary because this + // function is called during the process of moving instructions during + // vectorization and the results of the alias analysis are not stable during + // that process. + bool WIVectorize::trackUsesOfI(DenseSet<Value *> &Users, + AliasSetTracker &WriteSet, Instruction *I, + Instruction *J, bool UpdateUsers, + std::multimap<Value *, Value *> *LoadMoveSet) { + bool UsesI = false; + + // This instruction may already be marked as a user due, for example, to + // being a member of a selected pair. 
+ if (Users.count(J)) + UsesI = true; + + if (!UsesI) + for (User::op_iterator JU = J->op_begin(), JE = J->op_end(); + JU != JE; ++JU) { + Value *V = *JU; + if (I == V || Users.count(V)) { + UsesI = true; + break; + } + } + if (!UsesI && J->mayReadFromMemory()) { + if (LoadMoveSet) { + VPIteratorPair JPairRange = LoadMoveSet->equal_range(J); + UsesI = isSecondInIteratorPair<Value*>(I, JPairRange); + } + } + + if (UsesI && UpdateUsers) { + if (J->mayWriteToMemory()) WriteSet.add(J); + Users.insert(J); + } + + return UsesI; + } + + // This function iterates over all instruction pairs in the provided + // basic block and collects all candidate pairs for vectorization. + bool WIVectorize::getCandidatePairs(BasicBlock &BB, + BasicBlock::iterator &Start, + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts) { + BasicBlock::iterator E = BB.end(); + LLVMContext& context = BB.getContext(); + + if (Start == E) return false; + + std::multimap<int, ValueVector*> temporary; + for (BasicBlock::iterator I = Start++; I != E; ++I) { + + if (I->getMetadata("wi") == NULL) + continue; + bool IsSimpleLoadStore; + if (!isInstVectorizable(I, IsSimpleLoadStore)) { + continue; + } + + MDNode* md = I->getMetadata("wi"); + MDNode* mdCounter = I->getMetadata("wi_counter"); + MDNode* mdRegion = dyn_cast<MDNode>(md->getOperand(1)); + + unsigned CI = cast<ConstantInt>(mdCounter->getOperand(1))->getZExtValue(); + unsigned RI = cast<ConstantInt>(mdRegion->getOperand(1))->getZExtValue(); + + std::multimap<int,ValueVector*>::iterator itb = temporary.lower_bound(CI); + std::multimap<int,ValueVector*>::iterator ite = temporary.upper_bound(CI); + ValueVector* tmpVec = NULL; + while(itb != ite) { + if (I->isSameOperationAs(cast<Instruction>((*(*itb).second)[0]))) { + // Test also if instructions are from same region. 
+ MDNode* tmpMD = + cast<Instruction>((*(*itb).second)[0])->getMetadata("wi"); + MDNode* tmpRINode = dyn_cast<MDNode>(tmpMD->getOperand(1)); + unsigned tmpRI = + cast<ConstantInt>(tmpRINode->getOperand(1))->getZExtValue(); + if (RI == tmpRI) + tmpVec = (*itb).second; + } + itb++; + } + if (tmpVec == NULL) { + tmpVec = new ValueVector; + temporary.insert(std::pair<int, ValueVector*>(CI, tmpVec)); + } + tmpVec->push_back(I); + } + DenseSet<Value *> Users; + AliasSetTracker WriteSet(*AA); + for (std::multimap<int, ValueVector*>::iterator insIt = temporary.begin(); + insIt != temporary.end(); insIt++) { + ValueVector* tmpVec = (*insIt).second; + // Prevent creation of vectors shorter then the vector width in case + // vectorization of asymetric counters is disabled. + if (tmpVec->size() % 2 != 0 && NoCount) { + continue; + } + + if (tmpVec->size() % 2 != 0 && !MemOpsOnly) { + + // Ok, this is extremely ugly, however this code is specific for + // for situation where the base address of some array is computed + // one way and the addresses for the rest of the work items are + // computed other way. E.g. + // id_0 = x*y*z + // id_1 = id_0 + const + // id_2 = id_0 + const + const + // ... + // Therefore only applicable to add operation. + // It should bring some performance improvements when targetting TTA. + + // NOTE: results are opposide of what is expected. + // With NoCount set to true, the vectorization of loop counter arithmetic + // operations is actually prevented. The ProgramPartitioner is assigning + // them to the lanes. This seems to provide better performance. + // With NoCount set to false, the vectorization of loop counter + // arithmetic is allowed, creating better bitcode, but when mapped + // to TTA, performance is much worse. 
+ + Instruction* tmp = cast<Instruction>((*tmpVec)[0]); + if ( !(tmpVec->size() == 1 || + tmp->getType()->isVectorTy() || + tmp->getOpcode() != Instruction::Add)) { + + bool identity = false; + bool argumentOperand = false; + // If none of the arguments to add is constant + // we do not replace it with identity, neither if operand + // is function argument since that can be used in different + // blocks. + for (unsigned o = 0; o < tmp->getNumOperands(); ++o) { + if (isa<ConstantInt>(tmp->getOperand(o))) { + identity = true; + } + if (isa<Argument>(tmp->getOperand(o))) { + argumentOperand = true; + } + } + if (!identity || argumentOperand) + continue; + + Instruction* K = tmp->clone(); + if ((*tmpVec)[0]->hasName()) { + std::string name = (*tmpVec)[0]->getName().str() + "_temp_0"; + K->setName(name); + } + + if (tmp->getMetadata("wi") != NULL) { + MDNode* md = tmp->getMetadata("wi"); + MDNode* xyz = dyn_cast<MDNode>(md->getOperand(2)); + MDNode* region = dyn_cast<MDNode>(md->getOperand(1)); + ConstantInt *CIX = + dyn_cast<ConstantInt>(xyz->getOperand(1)); + ConstantInt *CIY = + dyn_cast<ConstantInt>(xyz->getOperand(2)); + ConstantInt *CIZ = + dyn_cast<ConstantInt>(xyz->getOperand(3)); + if (CIX->getValue() == 1) { + Value *v2[] = { + MDString::get(context, "WI_xyz"), + ConstantInt::get(Type::getInt32Ty(context), 0), + CIY, + CIZ}; + MDNode* newXYZ = MDNode::get(context, v2); + Value *v[] = { + MDString::get(context, "WI_data"), + region, + newXYZ}; + MDNode* mdNew = MDNode::get(context, v); + K->setMetadata("wi", mdNew); + K->setMetadata("wi_counter", tmp->getMetadata("wi_counter")); + } + } + for (unsigned o = 0; o < K->getNumOperands(); ++o) { + if (isa<ConstantInt>(K->getOperand(o))) { + K->setOperand(o, + ConstantInt::get(K->getOperand(o)->getType(), 0)); + } + } + + Value* original = NULL; + for (unsigned o = 0; o < K->getNumOperands(); ++o) { + if (!isa<PHINode>(K->getOperand(o)) && + isa<Instruction>(K->getOperand(o))) { + original = K->getOperand(o); + } + } 
+ if (original != NULL) { + K->insertAfter(cast<Instruction>(original)); + std::vector<User*> usesToReplace; + for (Value::use_iterator it = original->use_begin(); + it != original->use_end(); + it++) { + bool usedInVec = false; + if (*it != K) { + if (!NoCount) { + for (unsigned int j = 0; j < tmpVec->size(); j++) { + if ((*it) == (*tmpVec)[j]) { + usedInVec = true; + break; + } + } + } + if (!usedInVec) { + usesToReplace.push_back(*it); + } + } + } + for (unsigned int j = 0; j < usesToReplace.size(); j++) { + usesToReplace[j]->replaceUsesOfWith(original, K); + } + } else { + K->insertBefore(tmp); + } + tmpVec->insert(tmpVec->begin(), K); + } + } + + // Create actual candidate pairs + for (unsigned j = 0; j < tmpVec->size()/2; j++) { + Instruction* I = cast<Instruction>((*tmpVec)[2*j]); + Instruction* J = cast<Instruction>((*tmpVec)[2*j+1]); + if (!areInstsCompatibleFromDifferentWi(I,J)) continue; + bool IsSimpleLoadStore; + + if (!isInstVectorizable(I, IsSimpleLoadStore)) { + break; + } + + if (!areInstsCompatible(I, J, IsSimpleLoadStore)) { + break; + } + + // Determine if J uses I, if so, exit the loop. + bool UsesI = trackUsesOfI(Users, WriteSet, I, J, true); + if (UsesI) { + break; + } + + if (!PairableInsts.size() || + PairableInsts[PairableInsts.size()-1] != I) { + PairableInsts.push_back(I); + } + CandidatePairs.insert(ValuePair(I, J)); + } + } + return false; + } + + // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that + // it looks for pairs such that both members have an input which is an + // output of PI or PJ. + void WIVectorize::computePairsConnectedTo( + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *>& /*PairableInsts*/, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + ValuePair P) { + StoreInst *SI, *SJ; + // For each possible pairing for this variable, look at the uses of + // the first value... 
+ for (Value::use_iterator I = P.first->use_begin(), + E = P.first->use_end(); I != E; ++I) { + if (isa<LoadInst>(*I)) { + // A pair cannot be connected to a load because the load only takes one + // operand (the address) and it is a scalar even after vectorization. + continue; + } else if ((SI = dyn_cast<StoreInst>(*I)) && + P.first == SI->getPointerOperand()) { + // Similarly, a pair cannot be connected to a store through its + // pointer operand. + continue; + } + VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); + + // For each use of the first variable, look for uses of the second + // variable... + for (Value::use_iterator J = P.second->use_begin(), + E2 = P.second->use_end(); J != E2; ++J) { + + if ((SJ = dyn_cast<StoreInst>(*J)) && + P.second == SJ->getPointerOperand()) + continue; + + VPIteratorPair JPairRange = CandidatePairs.equal_range(*J); + + // Look for <I, J>: + if (isSecondInIteratorPair<Value*>(*J, IPairRange)) + ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); + + // Look for <J, I>: + if (isSecondInIteratorPair<Value*>(*I, JPairRange)) + ConnectedPairs.insert(VPPair(P, ValuePair(*J, *I))); + } + // Look for cases where just the first value in the pair is used by + // both members of another pair (splatting). + for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) { + if ((SJ = dyn_cast<StoreInst>(*J)) && + P.first == SJ->getPointerOperand()) + continue; + + if (isSecondInIteratorPair<Value*>(*J, IPairRange)) + ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); + } + } + // Look for cases where just the second value in the pair is used by + // both members of another pair (splatting). 
+ for (Value::use_iterator I = P.second->use_begin(), + E = P.second->use_end(); I != E; ++I) { + if (isa<LoadInst>(*I)) { + continue; + } else if ((SI = dyn_cast<StoreInst>(*I)) && + P.second == SI->getPointerOperand()) { + continue; + } + VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); + + for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) { + if ((SJ = dyn_cast<StoreInst>(*J)) && + P.second == SJ->getPointerOperand()) + continue; + + if (isSecondInIteratorPair<Value*>(*J, IPairRange)) + ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); + } + } + } + + // This function figures out which pairs are connected. Two pairs are + // connected if some output of the first pair forms an input to both members + // of the second pair. + void WIVectorize::computeConnectedPairs( + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs) { + + for (std::vector<Value *>::iterator PI = PairableInsts.begin(), + PE = PairableInsts.end(); PI != PE; ++PI) { + VPIteratorPair choiceRange = CandidatePairs.equal_range(*PI); + + for (std::multimap<Value *, Value *>::iterator P = choiceRange.first; + P != choiceRange.second; ++P) + computePairsConnectedTo(CandidatePairs, PairableInsts, + ConnectedPairs, *P); + } + + DEBUG(dbgs() << "WIV: found " << ConnectedPairs.size() + << " pair connections.\n"); + } + + // This function builds a set of use tuples such that <A, B> is in the set + // if B is in the use tree of A. If B is in the use tree of A, then B + // depends on the output of A. 
+ void WIVectorize::buildDepMap( + BasicBlock &BB, + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *>& /*PairableInsts*/, + DenseSet<ValuePair> &PairableInstUsers) { + DenseSet<Value *> IsInPair; + for (std::multimap<Value *, Value *>::iterator C = CandidatePairs.begin(), + E = CandidatePairs.end(); C != E; ++C) { + IsInPair.insert(C->first); + IsInPair.insert(C->second); + } + + // Iterate through the basic block, recording all Users of each + // pairable instruction. + + BasicBlock::iterator E = BB.end(); + for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) { + if (IsInPair.find(I) == IsInPair.end()) continue; + + DenseSet<Value *> Users; + AliasSetTracker WriteSet(*AA); + for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) + (void) trackUsesOfI(Users, WriteSet, I, J); + + for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end(); + U != E; ++U) + PairableInstUsers.insert(ValuePair(I, *U)); + } + } + + // Returns true if an input to pair P is an output of pair Q and also an + // input of pair Q is an output of pair P. If this is the case, then these + // two pairs cannot be simultaneously fused. + bool WIVectorize::pairsConflict(ValuePair P, ValuePair Q, + DenseSet<ValuePair> &PairableInstUsers, + std::multimap<ValuePair, ValuePair> *PairableInstUserMap) { + + // Two pairs are in conflict if they are mutual Users of eachother. 
+ bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) || + PairableInstUsers.count(ValuePair(P.first, Q.second)) || + PairableInstUsers.count(ValuePair(P.second, Q.first)) || + PairableInstUsers.count(ValuePair(P.second, Q.second)); + bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) || + PairableInstUsers.count(ValuePair(Q.first, P.second)) || + PairableInstUsers.count(ValuePair(Q.second, P.first)) || + PairableInstUsers.count(ValuePair(Q.second, P.second)); + if (PairableInstUserMap) { + // FIXME: The expensive part of the cycle check is not so much the cycle + // check itself but this edge insertion procedure. This needs some + // profiling and probably a different data structure (same is true of + // most uses of std::multimap). + if (PUsesQ) { + VPPIteratorPair QPairRange = PairableInstUserMap->equal_range(Q); + if (!isSecondInIteratorPair(P, QPairRange)) + PairableInstUserMap->insert(VPPair(Q, P)); + } + if (QUsesP) { + VPPIteratorPair PPairRange = PairableInstUserMap->equal_range(P); + if (!isSecondInIteratorPair(Q, PPairRange)) + PairableInstUserMap->insert(VPPair(P, Q)); + } + } + + return (QUsesP && PUsesQ); + } + + // This function walks the use graph of current pairs to see if, starting + // from P, the walk returns to P. + bool WIVectorize::pairWillFormCycle(ValuePair P, + std::multimap<ValuePair, ValuePair> &PairableInstUserMap, + DenseSet<ValuePair> &CurrentPairs) { + + DEBUG(if (DebugCycleCheck) + dbgs() << "WIV: starting cycle check for : " << *P.first << " <-> " + << *P.second << "\n"); + // A lookup table of visisted pairs is kept because the PairableInstUserMap + // contains non-direct associations. 
+ DenseSet<ValuePair> Visited; + SmallVector<ValuePair, 32> Q; + // General depth-first post-order traversal: + Q.push_back(P); + do { + ValuePair QTop = Q.pop_back_val(); + Visited.insert(QTop); + + DEBUG(if (DebugCycleCheck) + dbgs() << "WIV: cycle check visiting: " << *QTop.first << " <-> " + << *QTop.second << "\n"); + VPPIteratorPair QPairRange = PairableInstUserMap.equal_range(QTop); + for (std::multimap<ValuePair, ValuePair>::iterator C = QPairRange.first; + C != QPairRange.second; ++C) { + if (C->second == P) { + DEBUG(dbgs() + << "WIV: rejected to prevent non-trivial cycle formation: " + << *C->first.first << " <-> " << *C->first.second << "\n"); + return true; + } + + if (CurrentPairs.count(C->second) && !Visited.count(C->second)) + Q.push_back(C->second); + } + } while (!Q.empty()); + + return false; + } + + // This function builds the initial tree of connected pairs with the + // pair J at the root. + void WIVectorize::buildInitialTreeFor( + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *>& /*PairableInsts*/, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseSet<ValuePair>& /*PairableInstUsers*/, + DenseMap<Value *, Value *>& /*ChosenPairs*/, + DenseMap<ValuePair, size_t> &Tree, ValuePair J) { + // Each of these pairs is viewed as the root node of a Tree. The Tree + // is then walked (depth-first). As this happens, we keep track of + // the pairs that compose the Tree and the maximum depth of the Tree. 
+ SmallVector<ValuePairWithDepth, 32> Q; + // General depth-first post-order traversal: + Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); + do { + ValuePairWithDepth QTop = Q.back(); + + // Push each child onto the queue: + bool MoreChildren = false; + size_t MaxChildDepth = QTop.second; + VPPIteratorPair qtRange = ConnectedPairs.equal_range(QTop.first); + for (std::multimap<ValuePair, ValuePair>::iterator k = qtRange.first; + k != qtRange.second; ++k) { + // Make sure that this child pair is still a candidate: + bool IsStillCand = false; + VPIteratorPair checkRange = + CandidatePairs.equal_range(k->second.first); + for (std::multimap<Value *, Value *>::iterator m = checkRange.first; + m != checkRange.second; ++m) { + if (m->second == k->second.second) { + IsStillCand = true; + break; + } + } + + if (IsStillCand) { + DenseMap<ValuePair, size_t>::iterator C = Tree.find(k->second); + if (C == Tree.end()) { + size_t d = getDepthFactor(k->second.first); + Q.push_back(ValuePairWithDepth(k->second, QTop.second+d)); + MoreChildren = true; + } else { + MaxChildDepth = std::max(MaxChildDepth, C->second); + } + } + } + + if (!MoreChildren) { + // Record the current pair as part of the Tree: + Tree.insert(ValuePairWithDepth(QTop.first, MaxChildDepth)); + Q.pop_back(); + } + } while (!Q.empty()); + } + + // Given some initial tree, prune it by removing conflicting pairs (pairs + // that cannot be simultaneously chosen for vectorization). 
void WIVectorize::pruneTreeFor(
                    std::multimap<Value *, Value *> &/*CandidatePairs*/,
                    std::vector<Value *> &/*PairableInsts*/,
                    std::multimap<ValuePair, ValuePair> &ConnectedPairs,
                    DenseSet<ValuePair> &PairableInstUsers,
                    std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
                    DenseMap<Value *, Value *> &ChosenPairs,
                    DenseMap<ValuePair, size_t> &Tree,
                    DenseSet<ValuePair> &PrunedTree, ValuePair J,
                    bool UseCycleCheck) {
  // Walks the Tree from root J and fills PrunedTree with the pairs that are
  // kept. Every popped pair is recorded in PrunedTree; its children (looked
  // up in ConnectedPairs and restricted to members of Tree) are filtered
  // against siblings, PrunedTree, the pending queue and ChosenPairs.
  //
  // NOTE: pairsConflict() is called with &PairableInstUserMap when
  // UseCycleCheck is set, and in that mode it also records user edges in the
  // map. The order of the checks below is therefore significant.
  SmallVector<ValuePairWithDepth, 32> Q;
  // General depth-first post-order traversal:
  Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
  do {
    ValuePairWithDepth QTop = Q.pop_back_val();
    PrunedTree.insert(QTop.first);

    // Visit each child, pruning as necessary...
    DenseMap<ValuePair, size_t> BestChildren;
    VPPIteratorPair QTopRange = ConnectedPairs.equal_range(QTop.first);
    for (std::multimap<ValuePair, ValuePair>::iterator K = QTopRange.first;
         K != QTopRange.second; ++K) {
      DenseMap<ValuePair, size_t>::iterator C = Tree.find(K->second);
      if (C == Tree.end()) continue;

      // This child is in the Tree, now we need to make sure it is the
      // best of any conflicting children. There could be multiple
      // conflicting children, so first, determine if we're keeping
      // this child, then delete conflicting children as necessary.

      // It is also necessary to guard against pairing-induced
      // dependencies. Consider instructions a .. x .. y .. b
      // such that (a,b) are to be fused and (x,y) are to be fused
      // but a is an input to x and b is an output from y. This
      // means that y cannot be moved after b but x must be moved
      // after b for (a,b) to be fused. In other words, after
      // fusing (a,b) we have y .. a/b .. x where y is an input
      // to a/b and x is an output to a/b: x and y can no longer
      // be legally fused. To prevent this condition, we must
      // make sure that a child pair added to the Tree is not
      // both an input and output of an already-selected pair.

      // Pairing-induced dependencies can also form from more complicated
      // cycles. The pair vs. pair conflicts are easy to check, and so
      // that is done explicitly for "fast rejection", and because for
      // child vs. child conflicts, we may prefer to keep the current
      // pair in preference to the already-selected child.
      DenseSet<ValuePair> CurrentPairs;

      // Stage 1: compare against the siblings selected so far. A conflicting
      // sibling with depth >= this child's depth wins; otherwise the sibling
      // is remembered in CurrentPairs (it is evicted further below).
      bool CanAdd = true;
      for (DenseMap<ValuePair, size_t>::iterator C2
            = BestChildren.begin(), E2 = BestChildren.end();
           C2 != E2; ++C2) {
        if (C2->first.first == C->first.first ||
            C2->first.first == C->first.second ||
            C2->first.second == C->first.first ||
            C2->first.second == C->first.second ||
            pairsConflict(C2->first, C->first, PairableInstUsers,
                          UseCycleCheck ? &PairableInstUserMap : 0)) {
          if (C2->second >= C->second) {
            CanAdd = false;
            break;
          }

          CurrentPairs.insert(C2->first);
        }
      }
      if (!CanAdd) continue;

      // Even worse, this child could conflict with another node already
      // selected for the Tree. If that is the case, ignore this child.
      for (DenseSet<ValuePair>::iterator T = PrunedTree.begin(),
           E2 = PrunedTree.end(); T != E2; ++T) {
        if (T->first == C->first.first ||
            T->first == C->first.second ||
            T->second == C->first.first ||
            T->second == C->first.second ||
            pairsConflict(*T, C->first, PairableInstUsers,
                          UseCycleCheck ? &PairableInstUserMap : 0)) {
          CanAdd = false;
          break;
        }

        CurrentPairs.insert(*T);
      }
      if (!CanAdd) continue;

      // And check the queue too...
      for (SmallVector<ValuePairWithDepth, 32>::iterator C2 = Q.begin(),
           E2 = Q.end(); C2 != E2; ++C2) {
        if (C2->first.first == C->first.first ||
            C2->first.first == C->first.second ||
            C2->first.second == C->first.first ||
            C2->first.second == C->first.second ||
            pairsConflict(C2->first, C->first, PairableInstUsers,
                          UseCycleCheck ? &PairableInstUserMap : 0)) {
          CanAdd = false;
          break;
        }

        CurrentPairs.insert(C2->first);
      }
      if (!CanAdd) continue;

      // Last but not least, check for a conflict with any of the
      // already-chosen pairs.
      for (DenseMap<Value *, Value *>::iterator C2 =
            ChosenPairs.begin(), E2 = ChosenPairs.end();
           C2 != E2; ++C2) {
        if (pairsConflict(*C2, C->first, PairableInstUsers,
                          UseCycleCheck ? &PairableInstUserMap : 0)) {
          CanAdd = false;
          break;
        }

        CurrentPairs.insert(*C2);
      }
      if (!CanAdd) continue;

      // To check for non-trivial cycles formed by the addition of the
      // current pair we've formed a list of all relevant pairs, now use a
      // graph walk to check for a cycle. We start from the current pair and
      // walk the use tree to see if we again reach the current pair. If we
      // do, then the current pair is rejected.

      // FIXME: It may be more efficient to use a topological-ordering
      // algorithm to improve the cycle check. This should be investigated.
      if (UseCycleCheck &&
          pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs))
        continue;

      // This child can be added, but we may have chosen it in preference
      // to an already-selected child. Check for this here, and if a
      // conflict is found, then remove the previously-selected child
      // before adding this one in its place.
      // (pairsConflict is called without the user map here, unlike the
      // checks above.)
      for (DenseMap<ValuePair, size_t>::iterator C2
            = BestChildren.begin(); C2 != BestChildren.end();) {
        if (C2->first.first == C->first.first ||
            C2->first.first == C->first.second ||
            C2->first.second == C->first.first ||
            C2->first.second == C->first.second ||
            pairsConflict(C2->first, C->first, PairableInstUsers))
          BestChildren.erase(C2++);
        else
          ++C2;
      }

      BestChildren.insert(ValuePairWithDepth(C->first, C->second));
    }

    // Queue the surviving children; each carries the parent's accumulated
    // depth plus its own depth factor.
    for (DenseMap<ValuePair, size_t>::iterator C
          = BestChildren.begin(), E2 = BestChildren.end();
         C != E2; ++C) {
      size_t DepthF = getDepthFactor(C->first.first);
      Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF));
    }
  } while (!Q.empty());
}

// This function finds the best tree of mutually-compatible connected
// pairs, given the choice of root pairs as an iterator range.
// BestTree, BestMaxDepth and BestEffSize are in/out: they are overwritten
// only when a candidate root yields a pruned tree with a strictly larger
// effective size than the current BestEffSize.
void WIVectorize::findBestTreeFor(
                    std::multimap<Value *, Value *> &CandidatePairs,
                    std::vector<Value *> &PairableInsts,
                    std::multimap<ValuePair, ValuePair> &ConnectedPairs,
                    DenseSet<ValuePair> &PairableInstUsers,
                    std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
                    DenseMap<Value *, Value *> &ChosenPairs,
                    DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth,
                    size_t &BestEffSize, VPIteratorPair ChoiceRange,
                    bool UseCycleCheck) {
  for (std::multimap<Value *, Value *>::iterator J = ChoiceRange.first;
       J != ChoiceRange.second; ++J) {

    // Before going any further, make sure that this pair does not
    // conflict with any already-selected pairs (see comment below
    // near the Tree pruning for more details).
    DenseSet<ValuePair> ChosenPairSet;
    bool DoesConflict = false;
    for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(),
         E = ChosenPairs.end(); C != E; ++C) {
      if (pairsConflict(*C, *J, PairableInstUsers,
                        UseCycleCheck ? &PairableInstUserMap : 0)) {
        DoesConflict = true;
        break;
      }

      ChosenPairSet.insert(*C);
    }
    if (DoesConflict) continue;

    if (UseCycleCheck &&
        pairWillFormCycle(*J, PairableInstUserMap, ChosenPairSet))
      continue;

    DenseMap<ValuePair, size_t> Tree;
    buildInitialTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
                        PairableInstUsers, ChosenPairs, Tree, *J);

    // Because we'll keep the child with the largest depth, the largest
    // depth is still the same in the unpruned Tree.
    size_t MaxDepth = Tree.lookup(*J);

    DEBUG(if (DebugPairSelection) dbgs() << "WIV: found Tree for pair {"
                 << *J->first << " <-> " << *J->second << "} of depth " <<
                 MaxDepth << " and size " << Tree.size() << "\n");

    // At this point the Tree has been constructed, but, may contain
    // contradictory children (meaning that different children of
    // some tree node may be attempting to fuse the same instruction).
    // So now we walk the tree again, in the case of a conflict,
    // keep only the child with the largest depth. To break a tie,
    // favor the first child.

    DenseSet<ValuePair> PrunedTree;
    pruneTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
                 PairableInstUsers, PairableInstUserMap, ChosenPairs, Tree,
                 PrunedTree, *J, UseCycleCheck);

    // The effective size is the sum of the depth factors of the first
    // member of every pair that survived pruning.
    size_t EffSize = 0;
    for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
         E = PrunedTree.end(); S != E; ++S)
      EffSize += getDepthFactor(S->first);

    DEBUG(if (DebugPairSelection)
           dbgs() << "WIV: found pruned Tree for pair {"
           << *J->first << " <-> " << *J->second << "} of depth " <<
           MaxDepth << " and size " << PrunedTree.size() <<
          " (effective size: " << EffSize << ")\n");
    // When not building for LLVM 3.1, a non-null VTTI waives the minimum
    // chain depth requirement.
#if defined LLVM_3_1
    if (MaxDepth >= ReqChainDepth && EffSize > BestEffSize) {
#else
    if ((VTTI || MaxDepth >= ReqChainDepth) && EffSize > BestEffSize) {
#endif
      BestMaxDepth = MaxDepth;
      BestEffSize = EffSize;
      BestTree = PrunedTree;
    }
  }
}

// Given the list of candidate pairs, this function selects those
// that will be fused into vector instructions.
// Chosen pairs are added to ChosenPairs, and CandidatePairs is pruned of
// every other pair that mentions a chosen value.
void WIVectorize::choosePairs(
                    std::multimap<Value *, Value *> &CandidatePairs,
                    std::vector<Value *> &PairableInsts,
                    std::multimap<ValuePair, ValuePair> &ConnectedPairs,
                    DenseSet<ValuePair> &PairableInstUsers,
                    DenseMap<Value *, Value *>& ChosenPairs) {
  // The cycle check is unconditionally enabled in this function.
  bool UseCycleCheck = true;
  std::multimap<ValuePair, ValuePair> PairableInstUserMap;
  for (std::vector<Value *>::iterator I = PairableInsts.begin(),
       E = PairableInsts.end(); I != E; ++I) {
    // The number of possible pairings for this variable:
    size_t NumChoices = CandidatePairs.count(*I);
    if (!NumChoices) continue;

    VPIteratorPair ChoiceRange = CandidatePairs.equal_range(*I);

    // The best pair to choose and its tree:
    size_t BestMaxDepth = 0, BestEffSize = 0;
    DenseSet<ValuePair> BestTree;
    findBestTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
                    PairableInstUsers, PairableInstUserMap, ChosenPairs,
                    BestTree, BestMaxDepth, BestEffSize, ChoiceRange,
                    UseCycleCheck);

    // A tree has been chosen (or not) at this point. If no tree was
    // chosen, then this instruction, I, cannot be paired (and is no longer
    // considered).

    DEBUG(if (BestTree.size() > 0)
            dbgs() << "WIV: selected pairs in the best tree for: "
                   << *cast<Instruction>(*I) << "\n");

    for (DenseSet<ValuePair>::iterator S = BestTree.begin(),
         SE2 = BestTree.end(); S != SE2; ++S) {
      // Insert the members of this tree into the list of chosen pairs.
      ChosenPairs.insert(ValuePair(S->first, S->second));
      DEBUG(dbgs() << "WIV: selected pair: " << *S->first << " <-> " <<
             *S->second << "\n");

      // Remove all candidate pairs that have values in the chosen tree.
      for (std::multimap<Value *, Value *>::iterator K =
             CandidatePairs.begin(); K != CandidatePairs.end();) {
        if (K->first == S->first || K->second == S->first ||
            K->second == S->second || K->first == S->second) {
          // Don't remove the actual pair chosen so that it can be used
          // in subsequent tree selections.
          if (!(K->first == S->first && K->second == S->second))
            CandidatePairs.erase(K++);
          else
            ++K;
        } else {
          ++K;
        }
      }
    }
  }

  DEBUG(dbgs() << "WIV: selected " << ChosenPairs.size() << " pairs.\n");
}

// Returns the value that is to be used as the pointer input to the vector
// instruction that fuses I with J.
// The result is a newly created bitcast of the selected pointer to a pointer
// to the fused vector type; it is inserted before J when FlipMemInputs is
// set and before I otherwise.
Value *WIVectorize::getReplacementPointerInput(LLVMContext& /*Context*/,
                   Instruction *I, Instruction *J, unsigned o,
                   bool FlipMemInputs) {
  Value *IPtr, *JPtr;
  unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
  int64_t OffsetInElmts;

  // Note: the analysis might fail here, that is why the pair order has
  // been precomputed (OffsetInElmts must be unused here).
  (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
                        IAddressSpace, JAddressSpace,
                        OffsetInElmts);

  // The pointer value is taken to be the one with the lowest offset.
  Value *VPtr;
  if (!FlipMemInputs) {
    VPtr = IPtr;
  } else {
    // NOTE(review): this assignment is a no-op; FlipMemInputs is already
    // true on this branch.
    FlipMemInputs = true;
    VPtr = JPtr;
  }

  // If pointer source is another bitcast, go directly to original
  // instruction.
  if (isa<BitCastInst>(VPtr)) {
    VPtr = cast<BitCastInst>(VPtr)->getOperand(0);
  }
  Type *ArgTypeI = cast<PointerType>(IPtr->getType())->getElementType();
  Type *ArgTypeJ = cast<PointerType>(JPtr->getType())->getElementType();
  Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
  // The address space is always taken from IPtr, even when JPtr was selected
  // above.
  Type *VArgPtrType = PointerType::get(VArgType,
    cast<PointerType>(IPtr->getType())->getAddressSpace());
  BitCastInst* b = new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o),
                      /* insert before */ FlipMemInputs ? J : I);

  // Carry the "wi"/"wi_counter" metadata of I over to the new cast.
  if (I->getMetadata("wi") != NULL) {
    b->setMetadata("wi", I->getMetadata("wi"));
    b->setMetadata("wi_counter", I->getMetadata("wi_counter"));
  }
  return b;
}

// Copies the first NumElem/2 mask entries of shuffle J into Mask starting at
// MaskOffset. Undef entries (negative mask values) stay undef; other entries
// are shifted by IdxOffset, plus a further NumInElem when the original index
// is >= NumInElem.
void WIVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J,
                   unsigned NumElem, unsigned MaskOffset, unsigned NumInElem,
                   unsigned IdxOffset, std::vector<Constant*> &Mask) {
  for (unsigned v = 0; v < NumElem/2; ++v) {
    int m = cast<ShuffleVectorInst>(J)->getMaskValue(v);
    if (m < 0) {
      Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context));
    } else {
      unsigned mm = m + (int) IdxOffset;
      if (m >= (int) NumInElem)
        mm += (int) NumInElem;

      Mask[v+MaskOffset] =
        ConstantInt::get(Type::getInt32Ty(Context), mm);
    }
  }
}

// Returns the value that is to be used as the vector-shuffle mask to the
// vector instruction that fuses I with J.
Value *WIVectorize::getReplacementShuffleMask(LLVMContext& Context,
                   Instruction *I, Instruction *J) {
  // This is the shuffle mask. We need to append the second
  // mask to the first, and the numbers need to be adjusted.

  Type *ArgTypeI = I->getType();
  Type *ArgTypeJ = J->getType();
  Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
  // Get the total number of elements in the fused vector type.
  // By definition, this must equal the number of elements in
  // the final mask.
  unsigned NumElem = cast<VectorType>(VArgType)->getNumElements();
  std::vector<Constant*> Mask(NumElem);

  Type *OpType = I->getOperand(0)->getType();
  unsigned NumInElem = cast<VectorType>(OpType)->getNumElements();

  // For the mask from the first pair...
  fillNewShuffleMask(Context, I, NumElem, 0, NumInElem, 0, Mask);

  // For the mask from the second pair...
  fillNewShuffleMask(Context, J, NumElem, NumElem/2, NumInElem, NumInElem,
                     Mask);

  return ConstantVector::get(Mask);
}

// Returns the value recorded in storedSources for both I and J when the two
// entries exist and are equal; returns NULL otherwise.
Value *WIVectorize::CommonShuffleSource(Instruction *I, Instruction *J) {
    DenseMap<Value*, Value*>::iterator vi = storedSources.find(I);
    DenseMap<Value*, Value*>::iterator vj = storedSources.find(J);
    if (vi != storedSources.end()
        && vj != storedSources.end()) {
        if ((*vi).second == (*vj).second) {
            return (*vi).second;
        }
    }
    return NULL;
}
// Returns the value to be used as the specified operand of the vector
// instruction that fuses I with J.
+ Value *WIVectorize::getReplacementInput(LLVMContext& Context, Instruction *I, + Instruction *J, unsigned o, bool FlipMemInputs) { + Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); + Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); + + // Compute the fused vector type for this operand + Type *ArgType = I->getOperand(o)->getType(); + Type *ArgTypeJ = J->getOperand(o)->getType(); + VectorType *VArgType = getVecTypeForPair(ArgType, ArgTypeJ); + Instruction *L = I, *H = J; + if (FlipMemInputs) { + L = J; + H = I; + } + + if (ArgType->isVectorTy()) { + ShuffleVectorInst *LSV + = dyn_cast<ShuffleVectorInst>(L->getOperand(o)); + ShuffleVectorInst *HSV + = dyn_cast<ShuffleVectorInst>(H->getOperand(o)); + if (LSV && HSV && + LSV->getOperand(0)->getType() == HSV->getOperand(0)->getType() && + LSV->getOperand(1)->getType() == HSV->getOperand(1)->getType() && + LSV->getOperand(2)->getType() == HSV->getOperand(2)->getType()) { + if (LSV->getOperand(0) == HSV->getOperand(0) && + LSV->getOperand(1) == HSV->getOperand(1)) { + if (LSV->getOperand(2)->getType()->getVectorNumElements() == + HSV->getOperand(2)->getType()->getVectorNumElements()) { + unsigned elems = + LSV->getOperand(2)->getType()->getVectorNumElements(); + bool continous = true; + bool identical = true; + unsigned start = cast<ShuffleVectorInst>(LSV)->getMaskValue(0); + for (unsigned i = 0; i < elems; i++) { + unsigned m = cast<ShuffleVectorInst>(LSV)->getMaskValue(i); + if (m != i) + continous = false; + if (m != start) + identical = false; + unsigned n = cast<ShuffleVectorInst>(HSV)->getMaskValue(i); + if (n != i + elems) + continous = false; + if (n != start) + identical = false; + } + // This is the case where both sources come from same value and + // are in order. e.g. 0,1,2,3,4,5,6,7, as produced when + // replacing outputs of vector operation. 
+ if (continous && VArgType->getVectorNumElements() == elems*2) { + return LSV->getOperand(0); + } + // This is case where single value of input vector is replicated + // to whole output. Eventually should turn to buildvector MI. + if (identical) { + unsigned numElem = + cast<VectorType>(VArgType)->getNumElements(); + std::vector<Constant*> Mask(numElem); + for (unsigned v = 0; v < numElem; ++v) + Mask[v] = + ConstantInt::get(Type::getInt32Ty(Context), start); + + Instruction *BV = new ShuffleVectorInst( + (start < numElem/2) ? + LSV->getOperand(0): + LSV->getOperand(1), + UndefValue::get(LSV->getOperand(0)->getType()), + ConstantVector::get(Mask), + getReplacementName(I, true, o)); + if (LSV->getMetadata("wi") != NULL) { + BV->setMetadata("wi", LSV->getMetadata("wi")); + BV->setMetadata("wi_counter", LSV->getMetadata("wi_counter")); + } + BV->insertBefore(J); + return BV; + } + } + } +#if 0 + // This was made obsolete by test for continuity of shuffle indexes above + // and should be removed after futher tests for performance degradation. 
+ Value* res = CommonShuffleSource(LSV, HSV); + if (res && + res->getType()->getVectorNumElements() == + VArgType->getVectorNumElements()) { + return res; + } +#endif + } + InsertElementInst *LIN + = dyn_cast<InsertElementInst>(L->getOperand(o)); + InsertElementInst *HIN + = dyn_cast<InsertElementInst>(H->getOperand(o)); + + unsigned numElem = cast<VectorType>(VArgType)->getNumElements(); + if (LIN && HIN) { + Instruction *newIn = InsertElementInst::Create( + UndefValue::get(VArgType), + LIN->getOperand(1), + LIN->getOperand(2), + getReplacementName(I, true, o, 1)); + if (I->getMetadata("wi")) { + newIn->setMetadata("wi", I->getMetadata("wi")); + newIn->setMetadata("wi_counter", I->getMetadata("wi_counter")); + } + newIn->insertBefore(J); + + LIN = dyn_cast<InsertElementInst>(LIN->getOperand(0)); + int counter = 2; + int rounds = 0; + while (rounds < 2) { + while(LIN) { + unsigned Indx = cast<ConstantInt>(LIN->getOperand(2))->getZExtValue(); + Indx += rounds * (numElem/2); + Value *newIndx = ConstantInt::get(Type::getInt32Ty(Context), Indx); + newIn = InsertElementInst::Create( + newIn, + LIN->getOperand(1), + newIndx, + getReplacementName(I, true, o ,counter)); + counter++; + if (I->getMetadata("wi")) { + newIn->setMetadata("wi", I->getMetadata("wi")); + newIn->setMetadata("wi_counter", I->getMetadata("wi_counter")); + } + newIn->insertBefore(J); + LIN = dyn_cast<InsertElementInst>(LIN->getOperand(0)); + } + rounds ++; + LIN = HIN; + } + return newIn; + + } + std::vector<Constant*> Mask(numElem); + for (unsigned v = 0; v < numElem; ++v) + Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + + Instruction *BV = new ShuffleVectorInst(L->getOperand(o), + H->getOperand(o), + ConstantVector::get(Mask), + getReplacementName(I, true, o)); + if (L->getMetadata("wi") != NULL) { + BV->setMetadata("wi", L->getMetadata("wi")); + BV->setMetadata("wi_counter", L->getMetadata("wi_counter")); + } + BV->insertBefore(J); + return BV; + } + + // If these two inputs are the 
output of another vector instruction, + // then we should use that output directly. It might be necessary to + // permute it first. [When pairings are fused recursively, you can + // end up with cases where a large vector is decomposed into scalars + // using extractelement instructions, then built into size-2 + // vectors using insertelement and the into larger vectors using + // shuffles. InstCombine does not simplify all of these cases well, + // and so we make sure that shuffles are generated here when possible. + ExtractElementInst *LEE + = dyn_cast<ExtractElementInst>(L->getOperand(o)); + ExtractElementInst *HEE + = dyn_cast<ExtractElementInst>(H->getOperand(o)); + + if (LEE && HEE && + LEE->getOperand(0)->getType() == HEE->getOperand(0)->getType()) { + VectorType *EEType = cast<VectorType>(LEE->getOperand(0)->getType()); + unsigned LowIndx = cast<ConstantInt>(LEE->getOperand(1))->getZExtValue(); + unsigned HighIndx = cast<ConstantInt>(HEE->getOperand(1))->getZExtValue(); + if (LEE->getOperand(0) == HEE->getOperand(0)) { + if (LowIndx == 0 && HighIndx == 1) + return LEE->getOperand(0); + + std::vector<Constant*> Mask(2); + Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx); + Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx); + + Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0), + UndefValue::get(EEType), + ConstantVector::get(Mask), + getReplacementName(I, true, o)); + if (I->getMetadata("wi") != NULL) { + BV->setMetadata("wi", I->getMetadata("wi")); + BV->setMetadata("wi_counter", I->getMetadata("wi_counter")); + } + BV->insertBefore(J); + return BV; + } + + std::vector<Constant*> Mask(2); + HighIndx += EEType->getNumElements(); + Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx); + Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx); + + Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0), + HEE->getOperand(0), + ConstantVector::get(Mask), + getReplacementName(I, true, o)); + if 
(I->getMetadata("wi") != NULL) { + BV->setMetadata("wi", I->getMetadata("wi")); + BV->setMetadata("wi_counter", I->getMetadata("wi_counter")); + } + BV->insertBefore(J); + return BV; + } + + Instruction *BV1 = InsertElementInst::Create( + UndefValue::get(VArgType), + L->getOperand(o), CV0, + getReplacementName(I, true, o, 1)); + if (I->getMetadata("wi") != NULL) { + BV1->setMetadata("wi", I->getMetadata("wi")); + BV1->setMetadata("wi_counter", I->getMetadata("wi_counter")); + } + + BV1->insertBefore(I); + + Instruction *BV2 = InsertElementInst::Create(BV1, H->getOperand(o), + CV1, + getReplacementName(I, true, o, 2)); + if (J->getMetadata("wi") != NULL) { + BV2->setMetadata("wi",J->getMetadata("wi")); + BV2->setMetadata("wi_counter",J->getMetadata("wi_counter")); + } + BV2->insertBefore(J); + return BV2; + } + + // This function creates an array of values that will be used as the inputs + // to the vector instruction that fuses I with J. + void WIVectorize::getReplacementInputsForPair(LLVMContext& Context, + Instruction *I, Instruction *J, + SmallVector<Value *, 3> &ReplacedOperands, + bool FlipMemInputs) { + unsigned NumOperands = I->getNumOperands(); + + for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) { + // Iterate backward so that we look at the store pointer + // first and know whether or not we need to flip the inputs. + + if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) { + // This is the pointer for a load/store instruction. + ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o, + FlipMemInputs); + continue; + } else if (isa<CallInst>(I)) { + Function *F = cast<CallInst>(I)->getCalledFunction(); + unsigned IID = F->getIntrinsicID(); + if (o == NumOperands-1) { + BasicBlock &BB = *I->getParent(); + + Module *M = BB.getParent()->getParent(); + Type *ArgTypeI = I->getType(); + Type *ArgTypeJ = J->getType(); + Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); + + // FIXME: is it safe to do this here? 
+ ReplacedOperands[o] = Intrinsic::getDeclaration(M, + (Intrinsic::ID) IID, VArgType); + continue; + } else if (IID == Intrinsic::powi && o == 1) { + // The second argument of powi is a single integer and we've already + // checked that both arguments are equal. As a result, we just keep + // I's second argument. + ReplacedOperands[o] = I->getOperand(o); + continue; + } + } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) { + ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J); + continue; + } + + ReplacedOperands[o] = + getReplacementInput(Context, I, J, o, FlipMemInputs); + } + } + // As with the aliasing information, SCEV can also change because of + // vectorization. This information is used to compute relative pointer + // offsets; the necessary information will be cached here prior to + // fusion. + void WIVectorize::collectPtrInfo(std::vector<Value *> &PairableInsts, + DenseMap<Value *, Value *> &ChosenPairs, + DenseSet<Value *> &LowPtrInsts) { + for (std::vector<Value *>::iterator PI = PairableInsts.begin(), + PIE = PairableInsts.end(); PI != PIE; ++PI) { + DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI); + if (P == ChosenPairs.end()) continue; + + Instruction *I = cast<Instruction>(P->first); + Instruction *J = cast<Instruction>(P->second); + + if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<GetElementPtrInst>(I)) + continue; + + Value *IPtr, *JPtr; + unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; + int64_t OffsetInElmts; + if (!getPairPtrInfo( + I, J, IPtr, JPtr, IAlignment, JAlignment, IAddressSpace, + JAddressSpace, OffsetInElmts) || abs64(OffsetInElmts) != 1) { + if (!isa<GetElementPtrInst>(I)) + llvm_unreachable("Pre-fusion pointer analysis failed"); + } + Value *LowPI = (OffsetInElmts > 0) ? I : J; + LowPtrInsts.insert(LowPI); + } + } + + // This function creates two values that represent the outputs of the + // original I and J instructions. These are generally vector shuffles + // or extracts. 
// In many cases, these will end up being unused and, thus,
// eliminated by later passes.
//
// K is the fused instruction. On return K1 and K2 hold the replacement
// values for I and J respectively and InsertionPt is K2. For stores no
// replacement values are created; only the alias analysis is told that K
// replaces I and J.
void WIVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
                   Instruction *J, Instruction *K,
                   Instruction *&InsertionPt,
                   Instruction *&K1, Instruction *&K2,
                   bool FlipMemInputs) {
  Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
  Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);

  if (isa<StoreInst>(I)) {
    AA->replaceWithNewValue(I, K);
    AA->replaceWithNewValue(J, K);
  } else {
    Type *IType = I->getType();
    Type *JType = J->getType();

    VectorType *VType = getVecTypeForPair(IType, JType);

    if (IType->isVectorTy()) {
      // Vector outputs: K1/K2 are shuffles selecting the low (Mask1) or high
      // (Mask2) half of K; the halves are swapped when FlipMemInputs is set.
      unsigned numElem = cast<VectorType>(IType)->getNumElements();
      std::vector<Constant*> Mask1(numElem), Mask2(numElem);
      for (unsigned v = 0; v < numElem; ++v) {
        Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
        Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElem+v);
      }

      K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
                                 ConstantVector::get(
                                   FlipMemInputs ? Mask2 : Mask1),
                                 getReplacementName(K, false, 1));
      K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
                                 ConstantVector::get(
                                   FlipMemInputs ? Mask1 : Mask2),
                                 getReplacementName(K, false, 2));
      // Record K as the source of both K1 and K2 (storedSources) and the
      // reverse mapping (flippedStoredSources). Whatever FlipMemInputs is,
      // both K1 and K2 are inserted into each map.
      storedSources.insert(ValuePair(FlipMemInputs ? K1 : K2, K));
      storedSources.insert(ValuePair(FlipMemInputs ? K2 : K1, K));
      flippedStoredSources.insert(ValuePair(K, FlipMemInputs ? K1 : K2));
      flippedStoredSources.insert(ValuePair(K, FlipMemInputs ? K2 : K1));
      Instruction* L = I;
      Instruction* H = J;
      if (FlipMemInputs) {
        L = J;
        H = I;
      }
      // Values previously recorded as derived from L are re-pointed at K,
      // then L is dropped from both maps.
      VPIteratorPair v1 =
        flippedStoredSources.equal_range(L);
      for (std::multimap<Value*, Value*>::iterator ii = v1.first;
           ii != v1.second; ii++) {
        storedSources.erase((*ii).second);
        storedSources.insert(ValuePair((*ii).second,K));
        flippedStoredSources.insert(ValuePair(K, (*ii).second));
        storedSources.erase(L);
      }
      flippedStoredSources.erase(L);
      // Same re-pointing for values derived from H.
      VPIteratorPair v2 = flippedStoredSources.equal_range(H);
      for (std::multimap<Value*, Value*>::iterator ji = v2.first;
           ji != v2.second; ji++) {
        storedSources.erase((*ji).second);
        storedSources.insert(ValuePair((*ji).second,K));
        flippedStoredSources.insert(ValuePair(K, (*ji).second));
        storedSources.erase(H);
      }
      flippedStoredSources.erase(H);
    } else {
      // Scalar outputs: K1/K2 extract lane 0 and lane 1 of K (swapped when
      // FlipMemInputs is set).
      K1 = ExtractElementInst::Create(K, FlipMemInputs ? CV1 : CV0,
                                      getReplacementName(K, false, 1));
      K2 = ExtractElementInst::Create(K, FlipMemInputs ? CV0 : CV1,
                                      getReplacementName(K, false, 2));
      storedSources.insert(ValuePair(K1,K));
      storedSources.insert(ValuePair(K2,K));
      flippedStoredSources.insert(ValuePair(K, K1));
      flippedStoredSources.insert(ValuePair(K, K2));
    }
    // K1 inherits the work-item metadata of I, K2 that of J.
    if (I->getMetadata("wi") != NULL) {
      K1->setMetadata("wi", I->getMetadata("wi"));
      K1->setMetadata("wi_counter", I->getMetadata("wi_counter"));
    }
    if (J->getMetadata("wi") != NULL) {
      K2->setMetadata("wi", J->getMetadata("wi"));
      K2->setMetadata("wi_counter", J->getMetadata("wi_counter"));
    }

    K1->insertAfter(K);
    K2->insertAfter(K1);
    InsertionPt = K2;
  }
}

// Move all uses of the instruction I (including pairing-induced uses) after
// J. Each moved instruction is placed after InsertionPt, which is then
// advanced to the moved instruction so relative order is preserved.
void WIVectorize::moveUsesOfIAfterJ(BasicBlock &/*BB*/,
                   std::multimap<Value *, Value *> &LoadMoveSet,
                   Instruction *&InsertionPt,
                   Instruction *I, Instruction *J) {
  // Skip to the first instruction past I.
  BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));

  DenseSet<Value *> Users;
  AliasSetTracker WriteSet(*AA);
  for (; cast<Instruction>(L) != J;) {
    if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet)) {
      // Move this instruction
      // (the iterator is advanced before the instruction is unlinked).
      Instruction *InstToMove = L; ++L;

      InstToMove->removeFromParent();
      InstToMove->insertAfter(InsertionPt);
      InsertionPt = InstToMove;
    } else {
      ++L;
    }
  }
}


// Collect all load instruction that are in the move set of a given first
// pair member. These loads depend on the first instruction, I, and so need
// to be moved after J (the second instruction) when the pair is fused.
// Entries are stored in LoadMoveSet keyed by the dependent instruction, with
// I as the mapped value.
void WIVectorize::collectPairLoadMoveSet(BasicBlock &BB,
                   DenseMap<Value *, Value *> &/*ChosenPairs*/,
                   std::multimap<Value *, Value *> &LoadMoveSet,
                   Instruction *I) {
  // Skip to the first instruction past I.
  BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));

  DenseSet<Value *> Users;
  AliasSetTracker WriteSet(*AA);

  // Note: We cannot end the loop when we reach J because J could be moved
  // farther down the use chain by another instruction pairing. Also, J
  // could be before I if this is an inverted input.
  for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) {
    if (trackUsesOfI(Users, WriteSet, I, L)) {
      if (L->mayReadFromMemory())
        LoadMoveSet.insert(ValuePair(L, I));
    }
  }
}

// In cases where both load/stores and the computation of their pointers
// are chosen for vectorization, we can end up in a situation where the
// aliasing analysis starts returning different query results as the
// process of fusing instruction pairs continues. Because the algorithm
// relies on finding the same use trees here as were found earlier, we'll
// need to precompute the necessary aliasing information here and then
// manually update it during the fusion process.
+ void WIVectorize::collectLoadMoveSet(BasicBlock &BB, + std::vector<Value *> &PairableInsts, + DenseMap<Value *, Value *> &ChosenPairs, + std::multimap<Value *, Value *> &LoadMoveSet) { + for (std::vector<Value *>::iterator PI = PairableInsts.begin(), + PIE = PairableInsts.end(); PI != PIE; ++PI) { + DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI); + if (P == ChosenPairs.end()) continue; + + Instruction *I = cast<Instruction>(P->first); + collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, I); + } + } + + // This function fuses the chosen instruction pairs into vector instructions, + // taking care preserve any needed scalar outputs and, then, it reorders the + // remaining instructions as needed (users of the first member of the pair + // need to be moved to after the location of the second member of the pair + // because the vector instruction is inserted in the location of the pair's + // second member). + void WIVectorize::fuseChosenPairs(BasicBlock &BB, + std::vector<Value *> &PairableInsts, + DenseMap<Value *, Value *> &ChosenPairs) { + LLVMContext& Context = BB.getContext(); + + // During the vectorization process, the order of the pairs to be fused + // could be flipped. So we'll add each pair, flipped, into the ChosenPairs + // list. After a pair is fused, the flipped pair is removed from the list. 
+ std::vector<ValuePair> FlippedPairs; + FlippedPairs.reserve(ChosenPairs.size()); + for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(), + E = ChosenPairs.end(); P != E; ++P) + FlippedPairs.push_back(ValuePair(P->second, P->first)); + for (std::vector<ValuePair>::iterator P = FlippedPairs.begin(), + E = FlippedPairs.end(); P != E; ++P) + ChosenPairs.insert(*P); + + std::multimap<Value *, Value *> LoadMoveSet; + collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet); + DenseSet<Value *> LowPtrInsts; + collectPtrInfo(PairableInsts, ChosenPairs, LowPtrInsts); + + DEBUG(dbgs() << "WIV: initial: \n" << BB << "\n"); + + for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) { + DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI); + if (P == ChosenPairs.end()) { + ++PI; + continue; + } + + if (getDepthFactor(P->first) == 0) { + // These instructions are not really fused, but are tracked as though + // they are. Any case in which it would be interesting to fuse them + // will be taken care of by InstCombine. + --NumFusedOps; + ++PI; + continue; + } + + Instruction *I = cast<Instruction>(P->first), + *J = cast<Instruction>(P->second); + + DEBUG(dbgs() << "WIV: fusing: " << *I << + " <-> " << *J << "\n"); + + // Remove the pair and flipped pair from the list. 
+ DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second); + assert(FP != ChosenPairs.end() && "Flipped pair not found in list"); + ChosenPairs.erase(FP); + ChosenPairs.erase(P); + + bool FlipMemInputs = false; + if (isa<LoadInst>(I) || isa<StoreInst>(I) || isa<GetElementPtrInst>(I)) + FlipMemInputs = (LowPtrInsts.find(I) == LowPtrInsts.end()); + unsigned NumOperands = I->getNumOperands(); + SmallVector<Value *, 3> ReplacedOperands(NumOperands); + getReplacementInputsForPair(Context, I, J, ReplacedOperands, + FlipMemInputs); + + // Make a copy of the original operation, change its type to the vector + // type and replace its operands with the vector operands. + Instruction *K = I->clone(); + if (I->hasName()) K->takeName(I); + + if (I->getMetadata("wi") != NULL) { + K->setMetadata("wi", I->getMetadata("wi")); + K->setMetadata("wi_counter", I->getMetadata("wi_counter")); + } + if (!isa<StoreInst>(K)) + K->mutateType(getVecTypeForPair(I->getType(), J->getType())); + + for (unsigned o = 0; o < NumOperands; ++o) + K->setOperand(o, ReplacedOperands[o]); + + // If we've flipped the memory inputs, make sure that we take the correct + // alignment. + if (FlipMemInputs) { + if (isa<StoreInst>(K)) + cast<StoreInst>(K)->setAlignment(cast<StoreInst>(J)->getAlignment()); + else + cast<LoadInst>(K)->setAlignment(cast<LoadInst>(J)->getAlignment()); + } + + K->insertAfter(J); + + // Instruction insertion point: + Instruction *InsertionPt = K; + Instruction *K1 = 0, *K2 = 0; + replaceOutputsOfPair(Context, I, J, K, InsertionPt, K1, K2, + FlipMemInputs); + + // The use tree of the first original instruction must be moved to after + // the location of the second instruction. The entire use tree of the + // first instruction is disjoint from the input tree of the second + // (by definition), and so commutes with it. 
+ + moveUsesOfIAfterJ(BB, LoadMoveSet, InsertionPt, I, J); + + if (!isa<StoreInst>(I)) { + I->replaceAllUsesWith(K1); + J->replaceAllUsesWith(K2); + AA->replaceWithNewValue(I, K1); + AA->replaceWithNewValue(J, K2); + } + + // Instructions that may read from memory may be in the load move set. + // Once an instruction is fused, we no longer need its move set, and so + // the values of the map never need to be updated. However, when a load + // is fused, we need to merge the entries from both instructions in the + // pair in case those instructions were in the move set of some other + // yet-to-be-fused pair. The loads in question are the keys of the map. + if (I->mayReadFromMemory()) { + std::vector<ValuePair> NewSetMembers; + VPIteratorPair IPairRange = LoadMoveSet.equal_range(I); + VPIteratorPair JPairRange = LoadMoveSet.equal_range(J); + for (std::multimap<Value *, Value *>::iterator N = IPairRange.first; + N != IPairRange.second; ++N) + NewSetMembers.push_back(ValuePair(K, N->second)); + for (std::multimap<Value *, Value *>::iterator N = JPairRange.first; + N != JPairRange.second; ++N) + NewSetMembers.push_back(ValuePair(K, N->second)); + for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(), + AE = NewSetMembers.end(); A != AE; ++A) + LoadMoveSet.insert(*A); + } + + // Before removing I, set the iterator to the next instruction. 
+ PI = llvm::next(BasicBlock::iterator(I)); + if (cast<Instruction>(PI) == J) + ++PI; + + SE->forgetValue(I); + SE->forgetValue(J); + I->eraseFromParent(); + J->eraseFromParent(); + } + + DEBUG(dbgs() << "WIV: final: \n" << BB << "\n"); + } + void WIVectorize::dropUnused(BasicBlock& BB) { + bool changed; + do{ + BasicBlock::iterator J = BB.end(); + BasicBlock::iterator I = llvm::prior(J); + changed = false; + while (I != BB.begin()) { + + if (isa<ShuffleVectorInst>(*I) || + isa<ExtractElementInst>(*I) || + isa<InsertElementInst>(*I) || + isa<BitCastInst>(*I)) { + + Value* V = dyn_cast<Value>(&(*I)); + + if (V && V->use_empty()) { + SE->forgetValue(&(*I)); + (*I).eraseFromParent(); + // removed instruction could have messed up things + // start again from the end + I = BB.end(); + J = llvm::prior(I); + changed = true; + } else { + J = llvm::prior(I); + } + } else { + J = llvm::prior(I); + } + I = J; + } + } while (changed); + } + + // Replace uses of alloca with new alloca. + // This includes getelementpointer, bitcast, load and store only + // atm. + // In case original alloca was array, the getelementpointer and bitcast apply. + void WIVectorize::replaceUses(BasicBlock& BB, + AllocaInst& oldAlloca, + AllocaInst& newAlloca, + int indx) { + + LLVMContext& Context = BB.getContext(); + Instruction::use_iterator useiter = oldAlloca.use_begin(); + + while (useiter != oldAlloca.use_end()) { + llvm::User* tmp = *useiter; + + if (isa<BitCastInst>(tmp)) { + // Create new bitcast from new alloca to same type + // as old bitcast had. 
This is situation where the + // alloca is casted to i8* followed by + // call void @llvm.lifetime.start(i64 -1, i8* %XYZ) nounwind + BitCastInst* bitCast = cast<BitCastInst>(tmp); + IRBuilder<> builder(bitCast); + BitCastInst* newBitcast = + cast<BitCastInst>(builder.CreateBitCast( + &newAlloca, bitCast->getDestTy(), bitCast->getName())); + + if (bitCast->getMetadata("wi") != NULL) { + newBitcast->setMetadata("wi", bitCast->getMetadata("wi")); + newBitcast->setMetadata("wi_counter", bitCast->getMetadata("wi_counter")); + } + + bitCast->replaceAllUsesWith(newBitcast); + AA->replaceWithNewValue(bitCast, newBitcast); + SE->forgetValue(bitCast); + bitCast->eraseFromParent(); + + useiter = oldAlloca.use_begin(); + continue; + } + + if (isa<GetElementPtrInst>(tmp)) { + // Original getelementpointer contains number of indexes + // that indicate how to access element of allocated + // memory. Since we changed the most inner type to + // array, we add index to that array such as: + // Original alloca: + // %A = alloca [20 x [8 x i32]], align 4 + // Original getelementpointer: + // %68 = getelementptr inbounds [20 x [8 x i32]]]* %A, i32 0, i32 %X, i32 0 + // New alloca: + // %A = alloca [20 x [8 x [2 x i32]]], align 4 + // new getelementpointer: + // %68 = getelementptr inbounds [20 x [8 x [2 x i32]]]* %A, i32 0, i32 %X, i32 0, i32 0 + + GetElementPtrInst* gep = cast<GetElementPtrInst>(tmp); + std::vector<llvm::Value *> gepArgs; + // Collect original indexes of getelementpointer + for (unsigned int i = 1; i <= gep->getNumIndices(); i++) { + gepArgs.push_back(gep->getOperand(i)); + } + // Add index to the newly created array + Value *V = ConstantInt::get(Type::getInt32Ty(Context), indx); + gepArgs.push_back(V); + IRBuilder<> builder(gep); + GetElementPtrInst* newGep = + cast<GetElementPtrInst>( + builder.CreateGEP(&newAlloca, gepArgs, gep->getName())); + newGep->setIsInBounds(gep->isInBounds()); + + if (gep->getMetadata("wi") != NULL) { + newGep->setMetadata("wi", 
gep->getMetadata("wi")); + newGep->setMetadata("wi_counter", gep->getMetadata("wi_counter")); + } + + gep->replaceAllUsesWith(newGep); + AA->replaceWithNewValue(gep, newGep); + SE->forgetValue(gep); + gep->eraseFromParent(); + useiter = oldAlloca.use_begin(); + continue; + } + if (isa<StoreInst>(tmp)) { + // This is tricky, original alloca was for base type such + // as i32 or float so the variable was used directly. + // Now this is array so we have to add getelementpointer. + StoreInst* store = cast<StoreInst>(tmp); + std::vector<llvm::Value *> gepArgs; + Value *V = ConstantInt::get(Type::getInt32Ty(Context), indx); + gepArgs.push_back(V); + IRBuilder<> builder(store); + GetElementPtrInst* newGep = + cast<GetElementPtrInst>(builder.CreateGEP(&newAlloca, gepArgs)); + if (store->getMetadata("wi") != NULL) { + newGep->setMetadata("wi", store->getMetadata("wi")); + newGep->setMetadata("wi_counter", store->getMetadata("wi_counter")); + } + + for (unsigned int i = 0; i < store->getNumOperands(); i++) { + // Either of store operands could be alloca, we either + // store to allocated memory, or we are storing the pointer + // of the memory (this is rather dumb thing to do). + if (store->getOperand(i) == &oldAlloca) { + IRBuilder<> builder(store); + BitCastInst* newBitcast = + cast<BitCastInst>(builder.CreateBitCast( + newGep, store->getOperand(i)->getType())); + if (store->getMetadata("wi") != NULL) { + newBitcast->setMetadata("wi", store->getMetadata("wi")); + newBitcast->setMetadata("wi_counter", store->getMetadata("wi_counter")); + } + store->setOperand(i, newBitcast); + } + } + useiter = oldAlloca.use_begin(); + continue; + } + if (isa<LoadInst>(tmp)) { + // This is tricky, original alloca was for base type such + // as i32 or float so the variable was used directly. + // Now this is array so we have to add getelementpointer. 
+ + LoadInst* load = cast<LoadInst>(tmp); + std::vector<llvm::Value *> gepArgs; + Value *V = ConstantInt::get(Type::getInt32Ty(Context), indx); + gepArgs.push_back(V); + IRBuilder<> builder(load); + GetElementPtrInst* newGep = + cast<GetElementPtrInst>(builder.CreateGEP(&newAlloca, gepArgs)); + if (load->getMetadata("wi") != NULL) { + newGep->setMetadata("wi", load->getMetadata("wi")); + newGep->setMetadata("wi_counter", load->getMetadata("wi_counter")); + } + + for (unsigned int i = 0; i < load->getNumOperands(); i++) { + // Find operand of load that was old alloca and + // use bitcast to point to to getelementpointer result. + // There must be better way how to do this. + if (load->getOperand(i) == &oldAlloca) { + IRBuilder<> builder(load); + BitCastInst* newBitcast = + cast<BitCastInst>(builder.CreateBitCast( + newGep, load->getOperand(i)->getType())); + if (load->getMetadata("wi") != NULL) { + newBitcast->setMetadata("wi", load->getMetadata("wi")); + newBitcast->setMetadata("wi_counter", load->getMetadata("wi_counter")); + } + load->setOperand(i, newBitcast); + } + } + useiter = oldAlloca.use_begin(); + continue; + } + useiter++; + } + } + + // Find new type for the vector alloca instruction + Type* WIVectorize::newAllocaType(Type* start, unsigned int width) { + + if (start->isArrayTy()) { + // If type is still array check what is allocated type + int numElm = cast<ArrayType>(start)->getNumElements(); + return ArrayType::get( + newAllocaType( + cast<SequentialType>(start)->getElementType(), + width) + , numElm); + } else if (start->isFirstClassType() && !start->isPointerTy()) { + // Recursion stopping point + // This should convert i32 to [width x i32] as base type of + // array + return ArrayType::get(start, width); + } else { + // Not recognized type, just return it, alloca won't be replaced + return start; + } + } + + // In case there is private variable in the kernel that does not fit into + // register (multidimensional array for example), there are alloca 
+ // defined to create necessary memory space for variable. + // Those are defined then for each of the work items replicated. + // This pass attempts to combine those allocas to create 'interleaved' + // memory allocation that then can be accessed by vector loads and stores + // as described bellow: + // + // __kernel xyz() { + // + // int A[100][100][100][100]; + // ... + //} + // Will become after replication with 2 work items: + // + // %A = alloca [100 x [100 x [100 x i32]]], align 4 + // %A_wi_1_0_0 = alloca [100 x [100 x [100 x i32]]], align 4 + // + // This in will be converted here to : + // %A = alloca [100 x [100 x [100 x [2 x i32]]]], align 4 + // And respective getelementpointer instruction will + // be added additional paramter to select correct member from the pair. + // + // NOTE: This does work only for arrays ATM, the scalar type allocas + // as produced by phistoallocas pass required for the work loops + // are skipped for now. + + bool WIVectorize::vectorizeAllocas(BasicBlock& BB) { + + std::multimap<int, ValueVector*> allocas; + getCandidateAllocas(BB, allocas); + bool changed = false; + + for (std::multimap<int, ValueVector*>::iterator insIt = allocas.begin(); + insIt != allocas.end(); insIt++) { + IRBuilder<> builder( + BB.getParent()->getEntryBlock().getFirstInsertionPt()); + + ValueVector* tmpVec = (*insIt).second; + // Create as 'wide' alloca as number of elements found, + // could be smaller then vector width or larger. + // Should be same as work group dimensions for work item replicas or + // same as number of unrolled loops with work item loops. 
+ unsigned int allocaWidth = tmpVec->size(); + // No point vectorizing one alloca only + if (allocaWidth <= 1) + continue; + + AllocaInst* I = cast<AllocaInst>((*tmpVec)[0]); + Type* startType = I->getAllocatedType(); + if (!startType->isArrayTy()) + continue; + // Find new type for alloca by recursively searching through multiple + // dimensions of array + Type* newType = newAllocaType(startType, allocaWidth); + + // No new type was found, alloca type not supported. + if (newType == startType) + continue; + + changed = true; + llvm::AllocaInst *alloca = + builder.CreateAlloca(newType, 0, I->getName().str() + "_allocamix"); + alloca->setAlignment(I->getAlignment()); + + if (I->getMetadata("wi") != NULL) { + alloca->setMetadata("wi", I->getMetadata("wi")); + alloca->setMetadata("wi_counter", I->getMetadata("wi_counter")); + } + + // Replace uses of first alloca with newly created one + MDNode* mi = I->getMetadata("wi"); + assert(mi->getNumOperands() == 3); + // Second operand of MDNode contains MDNode with XYZ tripplet. + MDNode* iXYZ= dyn_cast<MDNode>(mi->getOperand(2)); + assert(iXYZ->getNumOperands() == 4); + + int index = dyn_cast<ConstantInt>(iXYZ->getOperand(1))->getZExtValue(); + + replaceUses(BB, *I, *alloca, index); + SE->forgetValue(I); + I->eraseFromParent(); + + // Replaces uses of other allocas with newly created one + for (unsigned int i = 1; i < allocaWidth; i++) { + AllocaInst* J = cast<AllocaInst>((*tmpVec)[i]); + MDNode* mj = J->getMetadata("wi"); + assert(mj->getNumOperands() == 3); + MDNode* jXYZ= dyn_cast<MDNode>(mj->getOperand(2)); + assert(jXYZ->getNumOperands() == 4); + int index = + dyn_cast<ConstantInt>(jXYZ->getOperand(1))->getZExtValue(); + + replaceUses(BB, *J, *alloca, index); + SE->forgetValue(J); + J->eraseFromParent(); + } + } + return changed; + } + + // Pass closely repated to getCandidatePairs, except this one only + // picks AllocaInst and makes sure they are from different work items. 
+ // It also returns all instances of AllocaInst at the same time. + bool WIVectorize::getCandidateAllocas(BasicBlock &BB, + std::multimap<int, ValueVector*>& temporary) { + + BasicBlock::iterator Start = BB.getFirstInsertionPt(); + BasicBlock::iterator E = BB.end(); + for (BasicBlock::iterator I = Start++; I != E; ++I) { + + if (!isa<AllocaInst>(I)) + continue; + // TODO: This is bit tricky, should it be possible + // to create vector of allocas that do not have metadata? + if (I->getMetadata("wi") == NULL) + continue; + + MDNode* md = I->getMetadata("wi"); + MDNode* mdCounter = I->getMetadata("wi_counter"); + MDNode* mdRegion = dyn_cast<MDNode>(md->getOperand(1)); + + unsigned CI = cast<ConstantInt>(mdCounter->getOperand(1))->getZExtValue(); + unsigned RI = cast<ConstantInt>(mdRegion->getOperand(1))->getZExtValue(); + + std::multimap<int,ValueVector*>::iterator itb = temporary.lower_bound(CI); + std::multimap<int,ValueVector*>::iterator ite = temporary.upper_bound(CI); + ValueVector* tmpVec = NULL; + while(itb != ite) { + if (I->isSameOperationAs(cast<Instruction>((*(*itb).second)[0]))) { + // Test also if instructions are from same region. 
+ MDNode* tmpMD = + cast<Instruction>((*(*itb).second)[0])->getMetadata("wi"); + MDNode* tmpRINode = dyn_cast<MDNode>(tmpMD->getOperand(1)); + unsigned tmpRI = + cast<ConstantInt>(tmpRINode->getOperand(1))->getZExtValue(); + if (RI == tmpRI) + tmpVec = (*itb).second; + } + itb++; + } + if (tmpVec == NULL) { + tmpVec = new ValueVector; + temporary.insert(std::pair<int, ValueVector*>(CI, tmpVec)); + } + tmpVec->push_back(I); + } + for (std::multimap<int, ValueVector*>::iterator insIt = temporary.begin(); + insIt != temporary.end(); insIt++) { + ValueVector* tmpVec = (*insIt).second; + for (unsigned j = 0; j < tmpVec->size()/2; j++) { + Instruction* I = cast<Instruction>((*tmpVec)[2*j]); + Instruction* J = cast<Instruction>((*tmpVec)[2*j+1]); + if (!areInstsCompatibleFromDifferentWi(I,J)) + continue; + } + } + return true; + } + +} +char WIVectorize::ID = 0; +RegisterPass<WIVectorize> + X("wi-vectorize", "Work item vectorization."); + +FunctionPass *createWIVectorizePass() { + return new WIVectorize(); +} + diff --git a/src/llvmopencl/WorkItemAliasAnalysis.cc b/src/llvmopencl/WorkItemAliasAnalysis.cc new file mode 100644 index 0000000..1d1fba7 --- /dev/null +++ b/src/llvmopencl/WorkItemAliasAnalysis.cc @@ -0,0 +1,119 @@ +/* + Copyright (c) 2012 Tampere University of Technology. + Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the "Software"), + to deal in the Software without restriction, including without limitation + the rights to use, copy, modify, merge, publish, distribute, sublicense, + and/or sell copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. + */ +/** + * @file WorkItemAliasAnalysis.cc + * + * Definition of WorkItemAliasAnalysis class. + * + * @author Vladimír Guzma 2012 + */ + +#include "WorkItemAliasAnalysis.h" +using namespace pocl; + +// Register this pass... +char WorkItemAliasAnalysis::ID = 0; +RegisterPass<WorkItemAliasAnalysis> + X("wi-aa", "Work item alias analysis.", false, false); +// Register it also to pass group +RegisterAnalysisGroup<AliasAnalysis> Y(X); + +ImmutablePass *createWorkItemAliasAnalysisPass() { + return new WorkItemAliasAnalysis(); +} + +extern "C" { + ImmutablePass* + create_workitem_aa_plugin() { + return new WorkItemAliasAnalysis(); + } +} + +void +WorkItemAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AliasAnalysis::getAnalysisUsage(AU); +} + +/** + * Test if memory locations are from different work items from same region. + * Then they can not alias. + */ +AliasAnalysis::AliasResult +WorkItemAliasAnalysis::alias(const Location &LocA, + const Location &LocB) { + // If either of the memory references is empty, it doesn't matter what the + // pointer values are. This allows the code below to ignore this special + // case. + if (LocA.Size == 0 || LocB.Size == 0) + return NoAlias; + + // Pointers from different address spaces do not alias + if (cast<PointerType>(LocA.Ptr->getType())->getAddressSpace() != + cast<PointerType>(LocB.Ptr->getType())->getAddressSpace()) { + return NoAlias; + } + // In case code is created by pocl, we can also use metadata. 
+ if (isa<Instruction>(LocA.Ptr) && isa<Instruction>(LocB.Ptr)) { + const Instruction* valA = dyn_cast<Instruction>(LocA.Ptr); + const Instruction* valB = dyn_cast<Instruction>(LocB.Ptr); + if (valA->getMetadata("wi") && valB->getMetadata("wi")) { + const MDNode* mdA = valA->getMetadata("wi"); + const MDNode* mdB = valB->getMetadata("wi"); + // Compare region ID. If they are same, different work items + // imply no aliasing. If regions are different or work items + // are same anything can happen. + // Fall back to other AAs. + const MDNode* mdRegionA = dyn_cast<MDNode>(mdA->getOperand(1)); + const MDNode* mdRegionB = dyn_cast<MDNode>(mdB->getOperand(1)); + ConstantInt* C1 = dyn_cast<ConstantInt>(mdRegionA->getOperand(1)); + ConstantInt* C2 = dyn_cast<ConstantInt>(mdRegionB->getOperand(1)); + if (C1->getValue() == C2->getValue()) { + // Now we have both locations from same region. Check for different + // work items. + MDNode* iXYZ= dyn_cast<MDNode>(mdA->getOperand(2)); + MDNode* jXYZ= dyn_cast<MDNode>(mdB->getOperand(2)); + assert(iXYZ->getNumOperands() == 4); + assert(jXYZ->getNumOperands() == 4); + + ConstantInt *CIX = dyn_cast<ConstantInt>(iXYZ->getOperand(1)); + ConstantInt *CJX = dyn_cast<ConstantInt>(jXYZ->getOperand(1)); + + ConstantInt *CIY = dyn_cast<ConstantInt>(iXYZ->getOperand(2)); + ConstantInt *CJY = dyn_cast<ConstantInt>(jXYZ->getOperand(2)); + + ConstantInt *CIZ = dyn_cast<ConstantInt>(iXYZ->getOperand(3)); + ConstantInt *CJZ = dyn_cast<ConstantInt>(jXYZ->getOperand(3)); + + if ( !(CIX->getValue() == CJX->getValue() + && CIY->getValue() == CJY->getValue() + && CIZ->getValue() == CJZ->getValue())) { + return NoAlias; + } + } + } + } + + // Forward the query to the next analysis. 
+ return AliasAnalysis::alias(LocA, LocB); +} diff --git a/src/llvmopencl/WorkItemAliasAnalysis.h b/src/llvmopencl/WorkItemAliasAnalysis.h new file mode 100644 index 0000000..5c07a02 --- /dev/null +++ b/src/llvmopencl/WorkItemAliasAnalysis.h @@ -0,0 +1,75 @@ +/* + Copyright (c) 2012 Tampere University of Technology. + Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the "Software"), + to deal in the Software without restriction, including without limitation + the rights to use, copy, modify, merge, publish, distribute, sublicense, + and/or sell copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. + */ +/** + * @file WorkItemAliasAnalysis.cc + * + * Definition of WorkItemAliasAnalysis class. 
+ * + * @author Vladimír Guzma 2012 + */ + +#include "config.h" +#include <iostream> + +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/Pass.h" +#if (defined LLVM_3_1 or defined LLVM_3_2) +#include "llvm/Metadata.h" +#include "llvm/Constants.h" +#else +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Constants.h" +#endif + +using namespace llvm; + +namespace pocl { +/// WorkItemAliasAnalysis - This is a simple alias analysis +/// implementation that uses pocl metadata to make sure memory accesses from +/// different work items are not aliasing. +class WorkItemAliasAnalysis : public llvm::ImmutablePass, public llvm::AliasAnalysis { +public: + static char ID; + WorkItemAliasAnalysis() : ImmutablePass(ID) {} + + /// getAdjustedAnalysisPointer - This method is used when a pass implements + /// an analysis interface through multiple inheritance. If needed, it + /// should override this to adjust the this pointer as needed for the + /// specified pass info. + virtual void *getAdjustedAnalysisPointer(AnalysisID PI) { + if (PI == &AliasAnalysis::ID) + return (AliasAnalysis*)this; + return this; + } + virtual void initializePass() { + InitializeAliasAnalysis(this); + } + + private: + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual AliasResult alias(const Location &LocA, const Location &LocB); + + }; +} + diff --git a/src/llvmopencl/Workgroup.cc b/src/llvmopencl/Workgroup.cc new file mode 100644 index 0000000..85cd84f --- /dev/null +++ b/src/llvmopencl/Workgroup.cc @@ -0,0 +1,619 @@ +// LLVM module pass to create the single function (fully inlined) +// and parallelized kernel for an OpenCL workgroup. 
+// +// Copyright (c) 2011 Universidad Rey Juan Carlos +// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#include "Workgroup.h" + +#include "CanonicalizeBarriers.h" +#include "BarrierTailReplication.h" +#include "WorkitemReplication.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "config.h" +#ifdef LLVM_3_1 +#include "llvm/Support/IRBuilder.h" +#include "llvm/Support/TypeBuilder.h" +#include "llvm/BasicBlock.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/InstrTypes.h" +#include "llvm/Module.h" +#elif defined LLVM_3_2 +#include "llvm/IRBuilder.h" +#include "llvm/TypeBuilder.h" +#include "llvm/BasicBlock.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/InstrTypes.h" +#include "llvm/Module.h" +#else +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/TypeBuilder.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Module.h" +#endif +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" +#include <cstdio> +#include <map> +#include <iostream> + +#include "pocl.h" + +#define STRING_LENGTH 32 + +using namespace std; +using namespace llvm; +using namespace pocl; + +static void noaliasArguments(Function *F); +static Function *createLauncher(Module &M, Function *F); +static void privatizeContext(Module &M, Function *F); +static void createWorkgroup(Module &M, Function *F); +static void createWorkgroupFast(Module &M, Function *F); + +// extern cl::opt<string> Header; +// extern cl::list<int> LocalSize; + +/* The kernel to process in this kernel compiler launch. 
*/ +cl::opt<string> +KernelName("kernel", + cl::desc("Kernel function name"), + cl::value_desc("kernel"), + cl::init("")); + +namespace llvm { + + typedef struct _pocl_context PoclContext; + + template<bool xcompile> class TypeBuilder<PoclContext, xcompile> { + public: + static StructType *get(LLVMContext &Context) { + if (size_t_width == 64) + { + return StructType::get + (TypeBuilder<types::i<32>, xcompile>::get(Context), + TypeBuilder<types::i<64>[3], xcompile>::get(Context), + TypeBuilder<types::i<64>[3], xcompile>::get(Context), + TypeBuilder<types::i<64>[3], xcompile>::get(Context), + NULL); + } + else if (size_t_width == 32) + { + return StructType::get + (TypeBuilder<types::i<32>, xcompile>::get(Context), + TypeBuilder<types::i<32>[3], xcompile>::get(Context), + TypeBuilder<types::i<32>[3], xcompile>::get(Context), + TypeBuilder<types::i<32>[3], xcompile>::get(Context), + NULL); + } + else + { + assert (false && "Unsupported size_t width."); + } + } + + /** + * We compile for various targets with various widths for the size_t + * type that depends on the pointer type. + * + * This should be set when the correct type is known. This is a hack + * until a better way is found. It's not thread safe, e.g. if one + * compiles multiple Modules for multiple different pointer widths in + * a same process with multiple threads. 
*/ + static void setSizeTWidth(int width) { + size_t_width = width; + } + + enum Fields { + WORK_DIM, + NUM_GROUPS, + GROUP_ID, + GLOBAL_OFFSET + }; + private: + static int size_t_width; + + }; + + template<bool xcompile> + int TypeBuilder<PoclContext, xcompile>::size_t_width = 0; + +} // namespace llvm + +char Workgroup::ID = 0; +static RegisterPass<Workgroup> X("workgroup", "Workgroup creation pass"); + + +bool +Workgroup::runOnModule(Module &M) +{ + if (M.getPointerSize() == llvm::Module::Pointer64) + { + TypeBuilder<PoclContext, true>::setSizeTWidth(64); + } + else if (M.getPointerSize() == llvm::Module::Pointer32) + { + TypeBuilder<PoclContext, true>::setSizeTWidth(32); + } + else + { + assert (false && "Target has an unsupported pointer width."); + } + + for (Module::iterator i = M.begin(), e = M.end(); i != e; ++i) { + if (!i->isDeclaration()) + i->setLinkage(Function::InternalLinkage); + } + + for (Module::iterator i = M.begin(), e = M.end(); i != e; ++i) { + if (!isKernelToProcess(*i)) continue; + Function *L = createLauncher(M, i); + +#if defined LLVM_3_2 + L->addFnAttr(Attributes::NoInline); +#else + L->addFnAttr(Attribute::NoInline); +#endif + + privatizeContext(M, L); + + createWorkgroup(M, L); + createWorkgroupFast(M, L); + } + + Function *barrier = cast<Function> + (M.getOrInsertFunction("barrier", + Type::getVoidTy(M.getContext()), + Type::getInt32Ty(M.getContext()), + NULL)); + + BasicBlock *bb = BasicBlock::Create(M.getContext(), "", barrier); + ReturnInst::Create(M.getContext(), 0, bb); + + return true; +} + +/** + * Marks the pointer arguments to the kernel functions as noalias. 
+ */ +static void +noaliasArguments(Function *F) +{ + for (unsigned i = 0, e = F->getFunctionType()->getNumParams(); i < e; ++i) + if (isa<PointerType> (F->getFunctionType()->getParamType(i))) + F->setDoesNotAlias(i + 1); // arg 0 is return type +} + +static Function * +createLauncher(Module &M, Function *F) +{ + SmallVector<Type *, 8> sv; + + for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end(); + i != e; ++i) + sv.push_back (i->getType()); + sv.push_back(TypeBuilder<PoclContext*, true>::get(M.getContext())); + + FunctionType *ft = FunctionType::get(Type::getVoidTy(M.getContext()), + ArrayRef<Type *> (sv), + false); + + std::string funcName = ""; + funcName = F->getName().str(); + + Function *L = Function::Create(ft, + Function::ExternalLinkage, + "_" + funcName, + &M); + + SmallVector<Value *, 8> arguments; + Function::arg_iterator ai = L->arg_begin(); + for (unsigned i = 0, e = F->getArgumentList().size(); i != e; ++i) { + arguments.push_back(ai); + ++ai; + } + + /* Copy the function attributes to transfer noalias etc. from the + original kernel which will be inlined into the launcher. 
*/ + L->setAttributes(F->getAttributes()); + + Value *ptr, *v; + char s[STRING_LENGTH]; + GlobalVariable *gv; + + IRBuilder<> builder(BasicBlock::Create(M.getContext(), "", L)); + + ptr = builder.CreateStructGEP(ai, + TypeBuilder<PoclContext, true>::WORK_DIM); + gv = M.getGlobalVariable("_work_dim"); + if (gv != NULL) { + v = builder.CreateLoad(builder.CreateConstGEP1_32(ptr, 0)); + builder.CreateStore(v, gv); + } + + + int size_t_width = 32; + if (M.getPointerSize() == llvm::Module::Pointer64) + size_t_width = 64; + + ptr = builder.CreateStructGEP(ai, + TypeBuilder<PoclContext, true>::GROUP_ID); + for (int i = 0; i < 3; ++i) { + snprintf(s, STRING_LENGTH, "_group_id_%c", 'x' + i); + gv = M.getGlobalVariable(s); + if (gv != NULL) { + if (size_t_width == 64) + { + v = builder.CreateLoad(builder.CreateConstGEP2_64(ptr, 0, i)); + } + else + { + v = builder.CreateLoad(builder.CreateConstGEP2_32(ptr, 0, i)); + } + builder.CreateStore(v, gv); + } + } + + ptr = builder.CreateStructGEP(ai, + TypeBuilder<PoclContext, true>::NUM_GROUPS); + for (int i = 0; i < 3; ++i) { + snprintf(s, STRING_LENGTH, "_num_groups_%c", 'x' + i); + gv = M.getGlobalVariable(s); + if (gv != NULL) { + if (size_t_width == 64) + { + v = builder.CreateLoad(builder.CreateConstGEP2_64(ptr, 0, i)); + } + else + { + v = builder.CreateLoad(builder.CreateConstGEP2_32(ptr, 0, i)); + } + builder.CreateStore(v, gv); + } + } + + ptr = builder.CreateStructGEP(ai, + TypeBuilder<PoclContext, true>::GLOBAL_OFFSET); + for (int i = 0; i < 3; ++i) { + snprintf(s, STRING_LENGTH, "_global_offset_%c", 'x' + i); + gv = M.getGlobalVariable(s); + if (gv != NULL) { + if (size_t_width == 64) + { + v = builder.CreateLoad(builder.CreateConstGEP2_64(ptr, 0, i)); + } + else + { + v = builder.CreateLoad(builder.CreateConstGEP2_32(ptr, 0, i)); + } + builder.CreateStore(v, gv); + } + } + + CallInst *c = builder.CreateCall(F, ArrayRef<Value*>(arguments)); + builder.CreateRetVoid(); + + InlineFunctionInfo IFI; + InlineFunction(c, IFI); 
+ + return L; +} + +static void +privatizeContext(Module &M, Function *F) +{ + char s[STRING_LENGTH]; + GlobalVariable *gv[3]; + AllocaInst *ai[3] = {NULL, NULL, NULL}; + + IRBuilder<> builder(F->getEntryBlock().getFirstNonPHI()); + + // Privatize _local_id + for (int i = 0; i < 3; ++i) { + snprintf(s, STRING_LENGTH, "_local_id_%c", 'x' + i); + gv[i] = M.getGlobalVariable(s); + if (gv[i] != NULL) { + ai[i] = builder.CreateAlloca(gv[i]->getType()->getElementType(), + 0, s); + if(gv[i]->hasInitializer()) { + Constant *c = gv[i]->getInitializer(); + builder.CreateStore(c, ai[i]); + } + } + } + for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) { + for (BasicBlock::iterator ii = i->begin(), ee = i->end(); + ii != ee; ++ii) { + for (int j = 0; j < 3; ++j) + ii->replaceUsesOfWith(gv[j], ai[j]); + } + } + + // Privatize _local_size + for (int i = 0; i < 3; ++i) { + snprintf(s, STRING_LENGTH, "_local_size_%c", 'x' + i); + gv[i] = M.getGlobalVariable(s); + if (gv[i] != NULL) { + ai[i] = builder.CreateAlloca(gv[i]->getType()->getElementType(), + 0, s); + if(gv[i]->hasInitializer()) { + Constant *c = gv[i]->getInitializer(); + builder.CreateStore(c, ai[i]); + } + } + } + for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) { + for (BasicBlock::iterator ii = i->begin(), ee = i->end(); + ii != ee; ++ii) { + for (int j = 0; j < 3; ++j) + ii->replaceUsesOfWith(gv[j], ai[j]); + } + } + + // Privatize _work_dim + gv[0] = M.getGlobalVariable("_work_dim"); + if (gv[0] != NULL) { + ai[0] = builder.CreateAlloca(gv[0]->getType()->getElementType(), + 0, "_work_dim"); + if(gv[0]->hasInitializer()) { + Constant *c = gv[0]->getInitializer(); + builder.CreateStore(c, ai[0]); + } + } + for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) { + for (BasicBlock::iterator ii = i->begin(), ee = i->end(); + ii != ee; ++ii) { + ii->replaceUsesOfWith(gv[0], ai[0]); + } + } + + // Privatize _num_groups + for (int i = 0; i < 3; ++i) { + snprintf(s, 
STRING_LENGTH, "_num_groups_%c", 'x' + i); + gv[i] = M.getGlobalVariable(s); + if (gv[i] != NULL) { + ai[i] = builder.CreateAlloca(gv[i]->getType()->getElementType(), + 0, s); + if(gv[i]->hasInitializer()) { + Constant *c = gv[i]->getInitializer(); + builder.CreateStore(c, ai[i]); + } + } + } + for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) { + for (BasicBlock::iterator ii = i->begin(), ee = i->end(); + ii != ee; ++ii) { + for (int j = 0; j < 3; ++j) + ii->replaceUsesOfWith(gv[j], ai[j]); + } + } + + // Privatize _group_id + for (int i = 0; i < 3; ++i) { + snprintf(s, STRING_LENGTH, "_group_id_%c", 'x' + i); + gv[i] = M.getGlobalVariable(s); + if (gv[i] != NULL) { + ai[i] = builder.CreateAlloca(gv[i]->getType()->getElementType(), + 0, s); + if(gv[i]->hasInitializer()) { + Constant *c = gv[i]->getInitializer(); + builder.CreateStore(c, ai[i]); + } + } + } + for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) { + for (BasicBlock::iterator ii = i->begin(), ee = i->end(); + ii != ee; ++ii) { + for (int j = 0; j < 3; ++j) + ii->replaceUsesOfWith(gv[j], ai[j]); + } + } + + // Privatize _global_offset + for (int i = 0; i < 3; ++i) { + snprintf(s, STRING_LENGTH, "_global_offset_%c", 'x' + i); + gv[i] = M.getGlobalVariable(s); + if (gv[i] != NULL) { + ai[i] = builder.CreateAlloca(gv[i]->getType()->getElementType(), + 0, s); + if(gv[i]->hasInitializer()) { + Constant *c = gv[i]->getInitializer(); + builder.CreateStore(c, ai[i]); + } + } + } + for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) { + for (BasicBlock::iterator ii = i->begin(), ee = i->end(); + ii != ee; ++ii) { + for (int j = 0; j < 3; ++j) + ii->replaceUsesOfWith(gv[j], ai[j]); + } + } +} + +/** + * Creates a work group launcher function (called KERNELNAME_workgroup) + * that assumes kernel pointer arguments are stored as pointers to the + * actual buffers and that scalar data is loaded from the default memory. 
+ */ +static void +createWorkgroup(Module &M, Function *F) +{ + IRBuilder<> builder(M.getContext()); + + FunctionType *ft = + TypeBuilder<void(types::i<8>*[], + PoclContext*), true>::get(M.getContext()); + + std::string funcName = ""; + funcName = F->getName().str(); + + Function *workgroup = + dyn_cast<Function>(M.getOrInsertFunction(funcName + "_workgroup", ft)); + assert(workgroup != NULL); + + builder.SetInsertPoint(BasicBlock::Create(M.getContext(), "", workgroup)); + + Function::arg_iterator ai = workgroup->arg_begin(); + + SmallVector<Value*, 8> arguments; + int i = 0; + for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end(); + ii != ee; ++ii) { + Type *t = ii->getType(); + + Value *gep = builder.CreateGEP(ai, + ConstantInt::get(IntegerType::get(M.getContext(), 32), i)); + Value *pointer = builder.CreateLoad(gep); + + /* If it's a pass by value pointer argument, we just pass the pointer + * as is to the function, no need to load form it first. */ + Value *value; + if (ii->hasByValAttr()) { + value = builder.CreateBitCast(pointer, t); + } else { + value = builder.CreateBitCast(pointer, t->getPointerTo()); + value = builder.CreateLoad(value); + } + + arguments.push_back(value); + ++i; + } + + arguments.back() = ++ai; + + builder.CreateCall(F, ArrayRef<Value*>(arguments)); + builder.CreateRetVoid(); +} + +/** + * Creates a work group launcher more suitable for the heterogeneous + * host-device setup (called KERNELNAME_workgroup_fast). + * + * 1) Pointer arguments are stored directly as pointers to the + * buffers in the argument buffer. + * + * 2) Scalar values are loaded from the global memory address + * space. + * + * This should minimize copying of data and memory allocation + * at the device. 
+ */ +static void +createWorkgroupFast(Module &M, Function *F) +{ + IRBuilder<> builder(M.getContext()); + + FunctionType *ft = + TypeBuilder<void(types::i<8>*[], + PoclContext*), true>::get(M.getContext()); + + std::string funcName = ""; + funcName = F->getName().str(); + Function *workgroup = + dyn_cast<Function>(M.getOrInsertFunction(funcName + "_workgroup_fast", ft)); + assert(workgroup != NULL); + + builder.SetInsertPoint(BasicBlock::Create(M.getContext(), "", workgroup)); + + Function::arg_iterator ai = workgroup->arg_begin(); + + SmallVector<Value*, 8> arguments; + int i = 0; + for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end(); + ii != ee; ++i, ++ii) { + Type *t = ii->getType(); + Value *gep = builder.CreateGEP(ai, + ConstantInt::get(IntegerType::get(M.getContext(), 32), i)); + Value *pointer = builder.CreateLoad(gep); + Value *bc = NULL; + + if (t->isPointerTy()) { + if (!ii->hasByValAttr()) { + /* Assume the pointer is directly in the arg array. */ + arguments.push_back(builder.CreateBitCast(pointer, t)); + continue; + } + + /* It's a pass by value pointer argument, use the underlying + * element type in subsequent load. */ + t = t->getPointerElementType(); + } + + /* Assume the pointer points to data in the global memory space. */ + bc = builder.CreateBitCast(pointer, + t->getPointerTo(POCL_ADDRESS_SPACE_GLOBAL)); + + /* If it's a pass by value pointer argument, we just pass the pointer + * as is to the function, no need to load from it first. */ + Value *value = builder.CreateBitCast( + pointer, t->getPointerTo(POCL_ADDRESS_SPACE_GLOBAL)); + if (!ii->hasByValAttr()) { + value = builder.CreateLoad(value); + } + + arguments.push_back(value); + } + + arguments.back() = ++ai; + + builder.CreateCall(F, ArrayRef<Value*>(arguments)); + builder.CreateRetVoid(); +} + + +/** + * Returns true in case the given function is a kernel that + * should be processed by the kernel compiler. 
+ */ +bool +Workgroup::isKernelToProcess(const Function &F) +{ + const Module *m = F.getParent(); + + NamedMDNode *kernels = m->getNamedMetadata("opencl.kernels"); + if (kernels == NULL) { + if (KernelName == "") + return true; + if (F.getName() == KernelName) + return true; + + return false; + } + + for (unsigned i = 0, e = kernels->getNumOperands(); i != e; ++i) { + if (kernels->getOperand(i)->getOperand(0) == NULL) + continue; // globaldce might have removed uncalled kernels + Function *k = cast<Function>(kernels->getOperand(i)->getOperand(0)); + if (&F == k) + return true; + } + + return false; +} diff --git a/src/llvmopencl/Workgroup.h b/src/llvmopencl/Workgroup.h new file mode 100644 index 0000000..26d7bfd --- /dev/null +++ b/src/llvmopencl/Workgroup.h @@ -0,0 +1,48 @@ +// Header for Workgroup.cc module pass. +// +// Copyright (c) 2011 Universidad Rey Juan Carlos +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#ifndef _POCL_WORKGROUP_H +#define _POCL_WORKGROUP_H + +#include "config.h" +#if (defined LLVM_3_1 or defined LLVM_3_2) +#include "llvm/Module.h" +#else +#include "llvm/IR/Module.h" +#endif +#include "llvm/Pass.h" + +namespace pocl { + class Workgroup : public llvm::ModulePass { + public: + static char ID; + + Workgroup() : ModulePass(ID) {} + + virtual bool runOnModule(llvm::Module &M); + + static bool isKernelToProcess(const llvm::Function &F); + + }; +} + +#endif diff --git a/src/llvmopencl/WorkitemHandler.cc b/src/llvmopencl/WorkitemHandler.cc new file mode 100644 index 0000000..90ed294 --- /dev/null +++ b/src/llvmopencl/WorkitemHandler.cc @@ -0,0 +1,278 @@ +// LLVM function pass to replicate the kernel body for all work items +// in a work group. +// +// Copyright (c) 2011-2012 Carlos Sánchez de La Lama / URJC and +// Pekka Jääskeläinen / TUT +// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "config.h" +#include <sstream> +#include <iostream> + +#if (defined LLVM_3_1 or defined LLVM_3_2) +#include "llvm/Metadata.h" +#include "llvm/Constants.h" +#include "llvm/Module.h" +#include "llvm/Instructions.h" +#include "llvm/ValueSymbolTable.h" +#else +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/ValueSymbolTable.h" +#endif +#include "llvm/Support/CommandLine.h" +#include "WorkitemHandler.h" +#include "Kernel.h" + +//#define DEBUG_REFERENCE_FIXING + +namespace pocl { + +using namespace llvm; + +cl::opt<bool> +AddWIMetadata("add-wi-metadata", cl::init(false), cl::Hidden, + cl::desc("Adds a work item identifier to each of the instruction in work items.")); + + +WorkitemHandler::WorkitemHandler(char& ID) : FunctionPass(ID) +{ +} + +bool +WorkitemHandler::runOnFunction(Function &F) +{ + return false; +} + +void +WorkitemHandler::Initialize(Kernel *K) +{ + llvm::Module *M = K->getParent(); + + LocalSizeX = 3; + LocalSizeY = 1; + LocalSizeZ = 1; + +// TODO: are we searching reqd_workgroup_size here? If so, we need to enforce it. 
+ llvm::NamedMDNode *size_info = M->getNamedMetadata("opencl.kernel_wg_size_info"); + if (size_info) { + for (unsigned i = 0, e = size_info->getNumOperands(); i != e; ++i) { + llvm::MDNode *KernelSizeInfo = size_info->getOperand(i); + if (KernelSizeInfo->getOperand(0) == K) { + LocalSizeX = (llvm::cast<ConstantInt>(KernelSizeInfo->getOperand(1)))->getLimitedValue(); + LocalSizeY = (llvm::cast<ConstantInt>(KernelSizeInfo->getOperand(2)))->getLimitedValue(); + LocalSizeZ = (llvm::cast<ConstantInt>(KernelSizeInfo->getOperand(3)))->getLimitedValue(); + } + } + } + + llvm::Type *localIdType; + if (M->getPointerSize() == llvm::Module::Pointer64) + size_t_width = 64; + else if (M->getPointerSize() == llvm::Module::Pointer32) + size_t_width = 32; + else + assert (false && "Only 32 and 64 bit size_t widths supported."); + + localIdType = IntegerType::get(K->getContext(), size_t_width); + + localIdZ = M->getOrInsertGlobal(POCL_LOCAL_ID_Z_GLOBAL, localIdType); + localIdY = M->getOrInsertGlobal(POCL_LOCAL_ID_Y_GLOBAL, localIdType); + localIdX = M->getOrInsertGlobal(POCL_LOCAL_ID_X_GLOBAL, localIdType); + + GlobalVariable *gvx = M->getNamedGlobal(POCL_LOCAL_ID_X_GLOBAL); + GlobalVariable *gvy = M->getNamedGlobal(POCL_LOCAL_ID_Y_GLOBAL); + GlobalVariable *gvz = M->getNamedGlobal(POCL_LOCAL_ID_Z_GLOBAL); + gvx->setSection(StringRef("far")); + gvy->setSection(StringRef("far")); + gvz->setSection(StringRef("far")); + + //Value *lsx = M->getOrInsertGlobal("_local_size_x", localIdType); + //Value *lsy = M->getOrInsertGlobal("_local_size_y", localIdType); + //Value *lsz = M->getOrInsertGlobal("_local_size_z", localIdType); + //GlobalVariable *gsx = M->getNamedGlobal("_local_size_x"); + //GlobalVariable *gsy = M->getNamedGlobal("_local_size_y"); + //GlobalVariable *gsz = M->getNamedGlobal("_local_size_z"); + //gsx->setSection(StringRef("far")); + //gsy->setSection(StringRef("far")); + //gsz->setSection(StringRef("far")); +} + +bool +WorkitemHandler::dominatesUse +(llvm::DominatorTree 
*DT, Instruction &I, unsigned i) { + Instruction *Op = cast<Instruction>(I.getOperand(i)); + BasicBlock *OpBlock = Op->getParent(); + PHINode *PN = dyn_cast<PHINode>(&I); + + // DT can handle non phi instructions for us. + if (!PN) + { + // Definition must dominate use unless use is unreachable! + return Op->getParent() == I.getParent() || + DT->dominates(Op, &I); + } + + // PHI nodes are more difficult than other nodes because they actually + // "use" the value in the predecessor basic blocks they correspond to. + unsigned j = PHINode::getIncomingValueNumForOperand(i); + BasicBlock *PredBB = PN->getIncomingBlock(j); + return (PredBB && DT->dominates(OpBlock, PredBB)); +} + +/* Fixes the undominated variable uses. + + These appear when a conditional barrier kernel is replicated to + form a copy of the *same basic block* in the alternative + "barrier path". + + E.g., from + + A -> [exit], A -> B -> [exit] + + a replicated CFG as follows, is created: + + A1 -> (T) A2 -> [exit1], A1 -> (F) A2' -> B1, B2 -> [exit2] + + The regions are correct because of the barrier semantics + of "all or none". In case any barrier enters the [exit1] + from A1, all must (because there's a barrier in the else + branch). + + Here at A2 and A2' one creates the same variables. + However, B2 does not know which copy + to refer to, the ones created in A2 or ones in A2' (correct). + The mapping data contains only one possibility, the + one that was placed there last. Thus, the instructions in B2 + might end up referring to the variables defined in A2 + which do not nominate them. + + The variable references are fixed by exploiting the knowledge + of the naming convention of the cloned variables. + + One potential alternative way would be to collect the refmaps per BB, + not globally. Then as a final phase traverse through the + basic blocks starting from the beginning and propagating the + reference data downwards, the data from the new BB overwriting + the old one. 
This should ensure the reachability without + the costly dominance analysis. +*/ +bool +WorkitemHandler::fixUndominatedVariableUses(llvm::DominatorTree *DT, + llvm::Function &F) +{ + bool changed = false; + DT->runOnFunction(F); + + for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) + { + llvm::BasicBlock *bb = i; + for (llvm::BasicBlock::iterator ins = bb->begin(), inse = bb->end(); + ins != inse; ++ins) + { + for (unsigned opr = 0; opr < ins->getNumOperands(); ++opr) + { + if (!isa<Instruction>(ins->getOperand(opr))) continue; + Instruction *operand = cast<Instruction>(ins->getOperand(opr)); + if (dominatesUse(DT, *ins, opr)) + continue; +#ifdef DEBUG_REFERENCE_FIXING + std::cout << "### dominance error!" << std::endl; + operand->dump(); + std::cout << "### does not dominate:" << std::endl; + ins->dump(); +#endif + StringRef baseName; + std::pair< StringRef, StringRef > pieces = + operand->getName().rsplit('.'); + if (pieces.second.startswith("pocl_")) + baseName = pieces.first; + else + baseName = operand->getName(); + + Value *alternative = NULL; + + unsigned int copy_i = 0; + do { + std::ostringstream alternativeName; + alternativeName << baseName.str(); + if (copy_i > 0) + alternativeName << ".pocl_" << copy_i; + + alternative = + F.getValueSymbolTable().lookup(alternativeName.str()); + + if (alternative != NULL) + { + ins->setOperand(opr, alternative); + if (dominatesUse(DT, *ins, opr)) + break; + } + + if (copy_i > 10000 && alternative == NULL) + break; /* ran out of possibilities */ + ++copy_i; + } while (true); + + if (alternative != NULL) + { +#ifdef DEBUG_REFERENCE_FIXING + std::cout << "### found the alternative:" << std::endl; + alternative->dump(); +#endif + changed |= true; + } else { +#ifdef DEBUG_REFERENCE_FIXING + std::cout << "### didn't fiund an alternative for" << std::endl; + operand->dump(); + std::cerr << "### BB:" << std::endl; + operand->getParent()->dump(); + std::cerr << "### the user BB:" << std::endl; + 
ins->getParent()->dump(); +#endif + std::cerr << "Could not find a dominating alternative variable." << std::endl; + abort(); + } + } + } + } + return changed; +} + +/** + * Moves the phi nodes in the beginning of the src to the beginning of + * the dst. + * + * MergeBlockIntoPredecessor function from llvm discards the phi nodes + * of the replicated BB because it has only one entry. + */ +void +WorkitemHandler::movePhiNodes(llvm::BasicBlock* src, llvm::BasicBlock* dst) +{ + while (PHINode *PN = dyn_cast<PHINode>(src->begin())) + PN->moveBefore(dst->getFirstNonPHI()); +} + + +} // namespace pocl diff --git a/src/llvmopencl/WorkitemHandler.h b/src/llvmopencl/WorkitemHandler.h new file mode 100644 index 0000000..6654fa8 --- /dev/null +++ b/src/llvmopencl/WorkitemHandler.h @@ -0,0 +1,73 @@ +// Header for WorkitemHandler, a parent class for all implementations of +// work item handling. +// +// Copyright (c) 2012 Pekka Jääskeläinen / TUT +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef _POCL_WORKITEM_HANDLER_H +#define _POCL_WORKITEM_HANDLER_H + +#include "config.h" +#if (defined LLVM_3_1 or defined LLVM_3_2) +#include "llvm/Function.h" +#else +#include "llvm/IR/Function.h" +#endif + +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" + +namespace llvm { + class DominatorTree; +} + +namespace pocl { + class Workgroup; + class Kernel; + + class WorkitemHandler : public llvm::FunctionPass { + public: + + WorkitemHandler(char& ID); + + virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const = 0; + virtual bool runOnFunction(llvm::Function &F); + + virtual void Initialize(pocl::Kernel *K); + + protected: + + void movePhiNodes(llvm::BasicBlock* src, llvm::BasicBlock* dst); + bool fixUndominatedVariableUses(llvm::DominatorTree *DT, llvm::Function &F); + bool dominatesUse(llvm::DominatorTree *DT, llvm::Instruction &I, unsigned i); + + int LocalSizeX, LocalSizeY, LocalSizeZ; + + unsigned size_t_width; + + /* The global variables that store the current local id. */ + llvm::Value *localIdZ, *localIdY, *localIdX; + + }; + + extern llvm::cl::opt<bool> AddWIMetadata; +} + +#endif diff --git a/src/llvmopencl/WorkitemHandlerChooser.cc b/src/llvmopencl/WorkitemHandlerChooser.cc new file mode 100644 index 0000000..4fcd226 --- /dev/null +++ b/src/llvmopencl/WorkitemHandlerChooser.cc @@ -0,0 +1,111 @@ +// LLVM function pass to select the best way to create a work group +// function for a kernel and work group size. 
+// +// Copyright (c) 2012 Pekka Jääskeläinen / Tampere University of Technology +// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#define DEBUG_TYPE "workitem-loops" + +#include "WorkitemHandlerChooser.h" +#include "WorkitemLoops.h" +#include "WorkitemReplication.h" +#include "Workgroup.h" +#include "CanonicalizeBarriers.h" +#include "Kernel.h" + +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/LoopInfo.h" + +#include <iostream> + +using namespace llvm; +using namespace pocl; + +namespace { + static + RegisterPass<WorkitemHandlerChooser> X( + "workitem-handler-chooser", + "Finds the best way to handle work-items to produce a multi-WG function.", + false, false); + +} + +namespace pocl { + +char WorkitemHandlerChooser::ID = 0; + +void +WorkitemHandlerChooser::getAnalysisUsage(AnalysisUsage &AU) const +{ + AU.setPreservesAll(); +} + + +bool +WorkitemHandlerChooser::runOnFunction(Function &F) +{ + if (!Workgroup::isKernelToProcess(F)) + return false; + + Kernel *K = cast<Kernel> (&F); + Initialize(K); + +#if 0 + std::string method = "auto"; + if (getenv("POCL_WORK_GROUP_METHOD") != NULL) + { + method = getenv("POCL_WORK_GROUP_METHOD"); + if (method == "repl" || method == "workitemrepl") + chosenHandler_ = POCL_WIH_FULL_REPLICATION; + else if (method == "loops" || method == "workitemloops") + chosenHandler_ = POCL_WIH_LOOPS; + else if (method != "auto") + { + std::cerr << "Unknown work group generation method. Using 'auto'." 
<< std::endl; + method = "auto"; + } + } + + if (method == "auto") + { + size_t ReplThreshold = 2; + if (getenv("POCL_FULL_REPLICATION_THRESHOLD") != NULL) + { + ReplThreshold = atoi(getenv("POCL_FULL_REPLICATION_THRESHOLD")); + } + + if (LocalSizeX*LocalSizeY*LocalSizeZ <= ReplThreshold) + { + chosenHandler_ = POCL_WIH_FULL_REPLICATION; + } + else + { + chosenHandler_ = POCL_WIH_LOOPS; + } + } +#else + chosenHandler_ = POCL_WIH_LOOPS; +#endif + + return false; +} + +} diff --git a/src/llvmopencl/WorkitemHandlerChooser.h b/src/llvmopencl/WorkitemHandlerChooser.h new file mode 100644 index 0000000..ae317e3 --- /dev/null +++ b/src/llvmopencl/WorkitemHandlerChooser.h @@ -0,0 +1,52 @@ +// Header for WorkitemHandlerChooser function pass. +// +// Copyright (c) 2012 Pekka Jääskeläinen / TUT +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#ifndef _POCL_WORKITEM_HANDLER_CHOOSER_H +#define _POCL_WORKITEM_HANDLER_CHOOSER_H + +#include "WorkitemHandler.h" + +namespace pocl { + class Workgroup; + + class WorkitemHandlerChooser : public pocl::WorkitemHandler { + public: + static char ID; + + enum WorkitemHandlerType { + POCL_WIH_FULL_REPLICATION, + POCL_WIH_LOOPS + }; + + WorkitemHandlerChooser() : pocl::WorkitemHandler(ID), + chosenHandler_(POCL_WIH_LOOPS) {} + + virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const; + virtual bool runOnFunction(llvm::Function &F); + + WorkitemHandlerType chosenHandler() { return chosenHandler_; } + private: + WorkitemHandlerType chosenHandler_; + }; +} + +#endif diff --git a/src/llvmopencl/WorkitemLoops.cc b/src/llvmopencl/WorkitemLoops.cc new file mode 100644 index 0000000..91eb055 --- /dev/null +++ b/src/llvmopencl/WorkitemLoops.cc @@ -0,0 +1,1061 @@ +// LLVM function pass to create loops that run all the work items +// in a work group while respecting barrier synchronization points. +// +// Copyright (c) 2012-2014 Pekka Jääskeläinen / Tampere University of Technology +// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#define DEBUG_TYPE "workitem-loops" + +#include "WorkitemLoops.h" +#include "Workgroup.h" +#include "Barrier.h" +#include "Kernel.h" +#include "config.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Support/CommandLine.h" +#ifdef LLVM_3_1 +#include "llvm/Support/IRBuilder.h" +#include "llvm/Support/TypeBuilder.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Instructions.h" +#include "llvm/Module.h" +#include "llvm/ValueSymbolTable.h" +#elif defined LLVM_3_2 +#include "llvm/IRBuilder.h" +#include "llvm/TypeBuilder.h" +#include "llvm/DataLayout.h" +#include "llvm/Instructions.h" +#include "llvm/Module.h" +#include "llvm/ValueSymbolTable.h" +#else +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/TypeBuilder.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/ValueSymbolTable.h" +#endif +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +#include <llvm/Support/InstIterator.h> +#include "WorkitemHandlerChooser.h" + +#include <iostream> +#include <map> +#include <sstream> +#include <vector> + +//#define DUMP_RESULT_CFG + +#ifdef DUMP_RESULT_CFG +#include "llvm/Analysis/CFGPrinter.h" +#endif + +//#define DEBUG_WORK_ITEM_LOOPS + +using namespace llvm; +using namespace pocl; + +namespace { + static + RegisterPass<WorkitemLoops> X("workitemloops", + "Workitem loop generation pass"); +} + +char WorkitemLoops::ID = 0; + +void +WorkitemLoops::getAnalysisUsage(AnalysisUsage &AU) const +{ + AU.addRequired<DominatorTree>(); + AU.addRequired<PostDominatorTree>(); + AU.addRequired<LoopInfo>(); +// TODO - Removed due to compilation error +#if 0 
+#ifdef LLVM_3_1 + AU.addRequired<TargetData>(); +#else + AU.addRequired<DataLayout>(); +#endif +#endif + AU.addRequired<pocl::WorkitemHandlerChooser>(); +} + +bool +WorkitemLoops::runOnFunction(Function &F) +{ + if (!Workgroup::isKernelToProcess(F)) + return false; + + if (getAnalysis<pocl::WorkitemHandlerChooser>().chosenHandler() != + pocl::WorkitemHandlerChooser::POCL_WIH_LOOPS) + return false; + + DT = &getAnalysis<DominatorTree>(); + LI = &getAnalysis<LoopInfo>(); + PDT = &getAnalysis<PostDominatorTree>(); + + tempInstructionIndex = 0; + +#if 0 + std::cerr << "### original:" << std::endl; + F.viewCFG(); +#endif + + bool changed = ProcessFunction(F); +#ifdef DUMP_RESULT_CFG + FunctionPass* cfgPrinter = createCFGOnlyPrinterPass(); + cfgPrinter->runOnFunction(F); +#endif + +#if 0 + std::cerr << "### after:" << std::endl; + F.viewCFG(); +#endif + + changed |= fixUndominatedVariableUses(DT, F); + +#if 0 + /* Split large BBs so we can print the Dot without it crashing. */ + bool fchanged = false; + const int MAX_INSTRUCTIONS_PER_BB = 70; + do { + fchanged = false; + for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) { + BasicBlock *b = i; + + if (b->size() > MAX_INSTRUCTIONS_PER_BB + 1) + { + int count = 0; + BasicBlock::iterator splitPoint = b->begin(); + while (count < MAX_INSTRUCTIONS_PER_BB || isa<PHINode>(splitPoint)) + { + ++splitPoint; + ++count; + } + SplitBlock(b, splitPoint, this); + fchanged = true; + break; + } + } + + } while (fchanged); + + F.viewCFG(); +#endif + contextArrays.clear(); + tempInstructionIds.clear(); + + return changed; +} + +std::pair<llvm::BasicBlock *, llvm::BasicBlock *> +WorkitemLoops::CreateLoopAround +(ParallelRegion &region, + llvm::BasicBlock *entryBB, llvm::BasicBlock *exitBB, + bool peeledFirst, llvm::Value *localIdVar, size_t LocalSizeForDim, + bool addIncBlock, llvm::Instruction *lsizeDim) +{ + assert (localIdVar != NULL); + + /* + + Generate a structure like this for each loop level (x,y,z): + + for.init: + + ; 
if peeledFirst is false: + store i32 0, i32* %_local_id_x, align 4 + + ; if peeledFirst is true (assume the 0,0,0 iteration has been executed earlier) + ; assume _local_id_x_first is initialized to 1 in the peeled pregion copy + store _local_id_x_first, i32* %_local_id_x, align 4 + store i32 0, %_local_id_x_first + + br label %for.body + + for.body: + + ; the parallel region code here + + br label %for.inc + + for.inc: + + ; Separated inc and cond check blocks for easier loop unrolling later on. + ; Can then chain N times for.body+for.inc to unroll. + + %2 = load i32* %_local_id_x, align 4 + %inc = add nsw i32 %2, 1 + + store i32 %inc, i32* %_local_id_x, align 4 + br label %for.cond + + for.cond: + + ; loop header, compare the id to the local size + %0 = load i32* %_local_id_x, align 4 + %cmp = icmp ult i32 %0, i32 123 + br i1 %cmp, label %for.body, label %for.end + + for.end: + + OPTIMIZE: Use a separate iteration variable across all the loops to iterate the context + data arrays to avoid needing multiplications to find the correct location, and to + enable easy vectorization of loading the context data when there are parallel iterations. + */ + + llvm::BasicBlock *loopBodyEntryBB = entryBB; + llvm::LLVMContext &C = loopBodyEntryBB->getContext(); + llvm::Function *F = loopBodyEntryBB->getParent(); + loopBodyEntryBB->setName("pregion.for.body"); + + assert (exitBB->getTerminator()->getNumSuccessors() == 1); + + llvm::BasicBlock *oldExit = exitBB->getTerminator()->getSuccessor(0); + + llvm::BasicBlock *forInitBB = + BasicBlock::Create(C, "pregion.for.init", F, loopBodyEntryBB); + + llvm::BasicBlock *loopEndBB = + BasicBlock::Create(C, "pregion.for.end", F, exitBB); + + llvm::BasicBlock *forCondBB = + BasicBlock::Create(C, "pregion.for.cond", F, exitBB); + + DT->runOnFunction(*F); + + // F->viewCFG(); + /* Fix the old edges jumping to the region to jump to the basic block + that starts the created loop. 
Back edges should still point to the + old basic block so we preserve the old loops. */ + BasicBlockVector preds; + llvm::pred_iterator PI = + llvm::pred_begin(entryBB), + E = llvm::pred_end(entryBB); + + for (; PI != E; ++PI) + { + llvm::BasicBlock *bb = *PI; + preds.push_back(bb); + } + + for (BasicBlockVector::iterator i = preds.begin(); + i != preds.end(); ++i) + { + llvm::BasicBlock *bb = *i; + /* Do not fix loop edges inside the region. The loop + is replicated as a whole to the body of the wi-loop.*/ + if (DT->dominates(loopBodyEntryBB, bb)) + continue; + bb->getTerminator()->replaceUsesOfWith(loopBodyEntryBB, forInitBB); + } + + IRBuilder<> builder(forInitBB); + + if (peeledFirst) + { + builder.CreateStore(builder.CreateLoad(localIdXFirstVar), localIdVar); + builder.CreateStore + (ConstantInt::get(IntegerType::get(C, size_t_width), 0), localIdXFirstVar); + } + else + { + builder.CreateStore + (ConstantInt::get(IntegerType::get(C, size_t_width), 0), localIdVar); + } + + builder.CreateBr(loopBodyEntryBB); + + exitBB->getTerminator()->replaceUsesOfWith(oldExit, forCondBB); + if (addIncBlock) + { + AppendIncBlock(exitBB, localIdVar); + } + + builder.SetInsertPoint(forCondBB); + + llvm::Value *cmpResult; + if (lsizeDim == NULL) + { + cmpResult = + builder.CreateICmpULT + (builder.CreateLoad(localIdVar), + (ConstantInt::get + (IntegerType::get(C, size_t_width), + LocalSizeForDim)) + ); + } + else + { + cmpResult = + builder.CreateICmpULT + (builder.CreateLoad(localIdVar), + lsizeDim + ); + } + + Instruction *loopBranch = + builder.CreateCondBr(cmpResult, loopBodyEntryBB, loopEndBB); + + /* Add the metadata to mark a parallel loop. The metadata + refer to a loop-unique dummy metadata that is not merged + automatically. */ + + /* This creation of the identifier metadata is copied from + LLVM's MDBuilder::createAnonymousTBAARoot(). 
*/ + MDNode *Dummy = MDNode::getTemporary(C, ArrayRef<Value*>()); + MDNode *Root = MDNode::get(C, Dummy); + // At this point we have + // !0 = metadata !{} <- dummy + // !1 = metadata !{metadata !0} <- root + // Replace the dummy operand with the root node itself and delete the dummy. + Root->replaceOperandWith(0, Root); + MDNode::deleteTemporary(Dummy); + // We now have + // !1 = metadata !{metadata !1} <- self-referential root + + loopBranch->setMetadata("llvm.loop.parallel", Root); + region.AddParallelLoopMetadata(Root); + + builder.SetInsertPoint(loopEndBB); + builder.CreateBr(oldExit); + + return std::make_pair(forInitBB, loopEndBB); +} + +ParallelRegion* +WorkitemLoops::RegionOfBlock(llvm::BasicBlock *bb) +{ + for (ParallelRegion::ParallelRegionVector::iterator + i = original_parallel_regions->begin(), + e = original_parallel_regions->end(); + i != e; ++i) + { + ParallelRegion *region = (*i); + if (region->HasBlock(bb)) return region; + } + return NULL; +} + +// PreAnalyze kernel function, find out dimension (borrowed from wga) +// PreCreate local sizes which are workgroup invariant +void WorkitemLoops::FindKernelDim(Function &F) +{ + maxDim = 1; + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) + if (CallInst * callInst = dyn_cast<CallInst>(&*I)) + { + if (!callInst->getCalledFunction()) continue; + std::string functionName(callInst->getCalledFunction()->getName()); + + if (functionName == "get_local_id" || + functionName == "get_global_id") + { + Value *arg = callInst->getArgOperand(0); + if (ConstantInt * constInt = dyn_cast<ConstantInt>(arg)) + { + unsigned int dimIdx = constInt->getSExtValue(); + dimIdx = (MAX_DIMENSIONS-1 < dimIdx) ? MAX_DIMENSIONS-1 : dimIdx; + maxDim = (maxDim < dimIdx + 1) ? dimIdx+1 : maxDim; + } + + /*------------------------------------------------------------- + * if the work group function has a variable argument, then + * assume worst case and return 3 loop levels are needed. 
+ *------------------------------------------------------------*/ + else + { + maxDim = 3; + break; + } + } + } + + llvm::Module *M = F.getParent(); + llvm::Type *Int32 = IntegerType::get(M->getContext(), 32); + FunctionType *ft = FunctionType::get + (/*Result=*/ Int32, + /*Params=*/ Int32, + /*isVarArg=*/ false); + Function *f_localsize = + dyn_cast<Function>(M->getOrInsertFunction("get_local_size", ft)); + SmallVector<Value *, 4> argsx, argsy, argsz; + argsx.push_back(ConstantInt::get(Int32, 0)); + lsizeX = CallInst::Create(f_localsize, ArrayRef<Value *>(argsx)); + if (maxDim > 1) + { + argsy.push_back(ConstantInt::get(Int32, 1)); + lsizeY = CallInst::Create(f_localsize, ArrayRef<Value *>(argsy)); + } + if (maxDim > 2) + { + argsz.push_back(ConstantInt::get(Int32, 2)); + lsizeZ = CallInst::Create(f_localsize, ArrayRef<Value *>(argsz)); + } +} + +bool +WorkitemLoops::ProcessFunction(Function &F) +{ + Kernel *K = cast<Kernel> (&F); + Initialize(K); + +#if 0 // TODO: do something for reqd_work_group_size + unsigned workItemCount = LocalSizeX*LocalSizeY*LocalSizeZ; + if (workItemCount == 1) + { + K->addLocalSizeInitCode(LocalSizeX, LocalSizeY, LocalSizeZ); + ParallelRegion::insertLocalIdInit(&F.getEntryBlock(), 0, 0, 0); + return true; + } +#endif + + FindKernelDim(F); + + original_parallel_regions = + K->getParallelRegions(LI); + + IRBuilder<> builder(F.getEntryBlock().getFirstInsertionPt()); + localIdXFirstVar = + builder.CreateAlloca + (IntegerType::get(F.getContext(), size_t_width), 0, ".pocl.local_id_x_init"); + + // F.viewCFGOnly(); + +#if 0 + std::cerr << "### Original" << std::endl; + F.viewCFG(); +#endif + +#if 0 + for (ParallelRegion::ParallelRegionVector::iterator + i = original_parallel_regions->begin(), + e = original_parallel_regions->end(); + i != e; ++i) + { + ParallelRegion *region = (*i); + region->InjectRegionPrintF(); + region->InjectVariablePrintouts(); + } +#endif + + /* Count how many parallel regions share each entry node to + detect diverging 
regions that need to be peeled. */ + std::map<llvm::BasicBlock*, int> entryCounts; + + for (ParallelRegion::ParallelRegionVector::iterator + i = original_parallel_regions->begin(), + e = original_parallel_regions->end(); + i != e; ++i) + { + ParallelRegion *region = (*i); +#ifdef DEBUG_WORK_ITEM_LOOPS + std::cerr << "### Adding context save/restore for PR: "; + region->dumpNames(); +#endif + FixMultiRegionVariables(region); + entryCounts[region->entryBB()]++; + } + +#if 0 + std::cerr << "### After context code addition:" << std::endl; + F.viewCFG(); +#endif + std::map<ParallelRegion*, bool> peeledRegion; + for (ParallelRegion::ParallelRegionVector::iterator + i = original_parallel_regions->begin(), + e = original_parallel_regions->end(); + i != e; ++i) + { + + llvm::ValueToValueMapTy reference_map; + ParallelRegion *original = (*i); + +#ifdef DEBUG_WORK_ITEM_LOOPS + std::cerr << "### handling region:" << std::endl; + original->dumpNames(); + //F.viewCFGOnly(); +#endif + + /* In case of conditional barriers, the first iteration + has to be peeled so we know which branch to execute + with the work item loop. In case there are more than one + parallel region sharing an entry BB, it's a diverging + region. + + Post dominance of entry by exit does not work in case the + region is inside a loop and the exit block is in the path + towards the loop exit (and the function exit). 
+ */ + bool peelFirst = entryCounts[original->entryBB()] > 1; + + peeledRegion[original] = peelFirst; + + std::pair<llvm::BasicBlock *, llvm::BasicBlock *> l; + // the original predecessor nodes of which successor + // should be fixed if not peeling + BasicBlockVector preds; + + bool unrolled = false; + if (peelFirst) + { +#ifdef DEBUG_WORK_ITEM_LOOPS + std::cerr << "### conditional region, peeling the first iteration" << std::endl; +#endif + ParallelRegion *replica = + original->replicate(reference_map, ".peeled_wi"); + replica->chainAfter(original); + replica->purge(); + + l = std::make_pair(replica->entryBB(), replica->exitBB()); + } + else + { + llvm::pred_iterator PI = + llvm::pred_begin(original->entryBB()), + E = llvm::pred_end(original->entryBB()); + + for (; PI != E; ++PI) + { + llvm::BasicBlock *bb = *PI; + if (DT->dominates(original->entryBB(), bb) && + (RegionOfBlock(original->entryBB()) == + RegionOfBlock(bb))) + continue; + preds.push_back(bb); + } + +#if 0 + int unrollCount; + if (getenv("POCL_WILOOPS_MAX_UNROLL_COUNT") != NULL) + unrollCount = atoi(getenv("POCL_WILOOPS_MAX_UNROLL_COUNT")); + else + unrollCount = 1; + /* Find a two's exponent unroll count, if available. 
*/ + while (unrollCount >= 1) + { + if (LocalSizeX % unrollCount == 0 && + unrollCount <= LocalSizeX) + { + break; + } + unrollCount /= 2; + } + + if (unrollCount > 1) { + ParallelRegion *prev = original; + llvm::BasicBlock *lastBB = + AppendIncBlock(original->exitBB(), localIdX); + original->AddBlockAfter(lastBB, original->exitBB()); + original->SetExitBB(lastBB); + + if (AddWIMetadata) + original->AddIDMetadata(F.getContext(), 0); + + for (int c = 1; c < unrollCount; ++c) + { + ParallelRegion *unrolled = + original->replicate(reference_map, ".unrolled_wi"); + unrolled->chainAfter(prev); + prev = unrolled; + lastBB = unrolled->exitBB(); + if (AddWIMetadata) + unrolled->AddIDMetadata(F.getContext(), c); + } + unrolled = true; + l = std::make_pair(original->entryBB(), lastBB); + } else { + l = std::make_pair(original->entryBB(), original->exitBB()); + } +#else + l = std::make_pair(original->entryBB(), original->exitBB()); +#endif + } + + l = CreateLoopAround(*original, l.first, l.second, peelFirst, localIdX, + LocalSizeX, true, lsizeX); + if (maxDim > 1) + l = CreateLoopAround(*original, l.first, l.second, false, localIdY, + LocalSizeY, true, lsizeY); + if (maxDim > 2) + l = CreateLoopAround(*original, l.first, l.second, false, localIdZ, + LocalSizeZ, true, lsizeZ); + + /* Loop edges coming from another region mean B-loops which means + we have to fix the loop edge to jump to the beginning of the wi-loop + structure, not its body. This has to be done only for non-peeled + blocks as the semantics is correct in the other case (the jump is + to the beginning of the peeled iteration). 
*/ + if (!peelFirst) + { + for (BasicBlockVector::iterator i = preds.begin(); + i != preds.end(); ++i) + { + llvm::BasicBlock *bb = *i; + bb->getTerminator()->replaceUsesOfWith + (original->entryBB(), l.first); + } + } + } + + // for the peeled regions we need to add a prologue + // that initializes the local ids and the first iteration + // counter + for (ParallelRegion::ParallelRegionVector::iterator + i = original_parallel_regions->begin(), + e = original_parallel_regions->end(); + i != e; ++i) + { + ParallelRegion *pr = (*i); + + if (!peeledRegion[pr]) continue; + pr->insertPrologue(0, 0, 0); + builder.SetInsertPoint(pr->entryBB()->getFirstInsertionPt()); + builder.CreateStore + (ConstantInt::get(IntegerType::get(F.getContext(), size_t_width), 1), + localIdXFirstVar); + } + + // Creating lsize* values have been hoisted up + // K->addLocalSizeInitCode(LocalSizeX, LocalSizeY, LocalSizeZ); + llvm::Instruction *inspt = F.getEntryBlock().getFirstNonPHI(); + inspt->getParent()->getInstList().insert(inspt, lsizeX); + if (maxDim > 1) + inspt->getParent()->getInstList().insert(inspt, lsizeY); + if (maxDim > 2) + inspt->getParent()->getInstList().insert(inspt, lsizeZ); + // llvm::GlobalVariable *gvx = M->getGlobalVariable("_local_size_x"); + // llvm::GlobalVariable *gvy = M->getGlobalVariable("_local_size_y"); + // llvm::GlobalVariable *gvz = M->getGlobalVariable("_local_size_z"); + // llvm::Instruction *storex = new StoreInst(lsizeX, gvx, inspt); + // llvm::Instruction *storey = new StoreInst(lsizeY, gvy, inspt); + // llvm::Instruction *storez = new StoreInst(lsizeZ, gvz, inspt); + + + ParallelRegion::insertLocalIdInit(&F.getEntryBlock(), 0, 0, 0); + +#if 0 + F.viewCFG(); +#endif + + return true; +} + +/* + * Add context save/restore code to variables that are defined in + * the given region and are used outside the region. + * + * Each such variable gets a slot in the stack frame. The variable + * is restored from the stack whenever it's used. 
+ * + */ +void +WorkitemLoops::FixMultiRegionVariables(ParallelRegion *region) +{ + InstructionIndex instructionsInRegion; + InstructionVec instructionsToFix; + + /* Construct an index of the region's instructions so it's + fast to figure out if the variable uses are all + in the region. */ + for (BasicBlockVector::iterator i = region->begin(); + i != region->end(); ++i) + { + llvm::BasicBlock *bb = *i; + for (llvm::BasicBlock::iterator instr = bb->begin(); + instr != bb->end(); ++instr) + { + llvm::Instruction *instruction = instr; + instructionsInRegion.insert(instruction); + } + } + + /* Find all the instructions that define new values and + check if they need to be context saved. */ + for (BasicBlockVector::iterator i = region->begin(); + i != region->end(); ++i) + { + llvm::BasicBlock *bb = *i; + for (llvm::BasicBlock::iterator instr = bb->begin(); + instr != bb->end(); ++instr) + { + llvm::Instruction *instruction = instr; + + if (ShouldNotBeContextSaved(instr)) continue; + + for (Instruction::use_iterator ui = instruction->use_begin(), + ue = instruction->use_end(); + ui != ue; ++ui) + { + Instruction *user; + if ((user = dyn_cast<Instruction> (*ui)) == NULL) continue; + // if the instruction is used outside this region inside another + // region (not in a regionless BB like the B-loop construct BBs), + // need to context save it. + if (instructionsInRegion.find(user) == instructionsInRegion.end() && + RegionOfBlock(user->getParent()) != NULL) + { + instructionsToFix.push_back(instruction); + break; + } + } + } + } + + /* Finally, fix the instructions. 
*/ + for (InstructionVec::iterator i = instructionsToFix.begin(); + i != instructionsToFix.end(); ++i) + { +#ifdef DEBUG_WORK_ITEM_LOOPS + std::cerr << "### adding context/save restore for" << std::endl; + (*i)->dump(); +#endif + llvm::Instruction *instructionToFix = *i; + AddContextSaveRestore(instructionToFix); + } +} + +llvm::Instruction * +WorkitemLoops::AddContextSave +(llvm::Instruction *instruction, llvm::Instruction *alloca) +{ + + if (isa<AllocaInst>(instruction)) + { + /* If the variable to be context saved is itself an alloca, + we have created one big alloca that stores the data of all the + work-items and return pointers to that array. Thus, we need + no initialization code other than the context data alloca itself. */ + return NULL; + } + + /* Save the produced variable to the array. */ + BasicBlock::iterator definition = dyn_cast<Instruction>(instruction); + + ++definition; + while (isa<PHINode>(definition)) ++definition; + + IRBuilder<> builder(definition); + std::vector<llvm::Value *> gepArgs; + + /* Reuse the id loads earlier in the region, if possible, to + avoid messy output with lots of redundant loads. */ + ParallelRegion *region = RegionOfBlock(instruction->getParent()); + assert ("Adding context save outside any region produces illegal code." 
&& + region != NULL); + +// linearize index computation for store into alloca +// alloca[idz * sizey*sizex + idy * sizex + idx] + llvm::Value *linear_index = region->LocalIDXLoad(); + if (maxDim > 1) + linear_index = builder.CreateAdd(linear_index, + builder.CreateMul(region->LocalIDYLoad(), + lsizeX) ); + if (maxDim > 2) + linear_index = builder.CreateAdd(linear_index, + builder.CreateMul(region->LocalIDZLoad(), + builder.CreateMul(lsizeY, lsizeX)) ); + gepArgs.push_back(linear_index); + + return builder.CreateStore(instruction, builder.CreateGEP(alloca, gepArgs)); + +} + +llvm::Instruction * +WorkitemLoops::AddContextRestore +(llvm::Value *val, llvm::Instruction *alloca, llvm::Instruction *before, + bool isAlloca) +{ + assert (val != NULL); + IRBuilder<> builder(alloca); + if (before != NULL) + { + builder.SetInsertPoint(before); + } + else if (isa<Instruction>(val)) + { + builder.SetInsertPoint(dyn_cast<Instruction>(val)); + before = dyn_cast<Instruction>(val); + } + else + { + assert (false && "Unknown context restore location!"); + } + + + std::vector<llvm::Value *> gepArgs; + + /* Reuse the id loads earlier in the region, if possible, to + avoid messy output with lots of redundant loads. */ + ParallelRegion *region = RegionOfBlock(before->getParent()); + assert ("Adding context save outside any region produces illegal code." 
&& + region != NULL); + +// linearize alloca loads +// idz * _local_size_x * _local_size_y + idy * _local_size_x + idx + llvm::Value *linear_index = region->LocalIDXLoad(); + if (maxDim > 1) + linear_index = builder.CreateAdd(linear_index, + builder.CreateMul(region->LocalIDYLoad(), + lsizeX) ); + if (maxDim > 2) + linear_index = builder.CreateAdd(linear_index, + builder.CreateMul(region->LocalIDZLoad(), + builder.CreateMul(lsizeY, lsizeX)) ); + gepArgs.push_back(linear_index); + + llvm::Instruction *gep = + dyn_cast<Instruction>(builder.CreateGEP(alloca, gepArgs)); + + if (isAlloca) { + /* In case the context saved instruction was an alloca, we created a + context array with pointed-to elements, and now want to return a pointer + to the elements to emulate the original alloca. */ + return gep; + } + return builder.CreateLoad(gep); +} + +/** + * Returns the context array (alloca) for the given Value, creates it if not + * found. + */ +llvm::Instruction * +WorkitemLoops::GetContextArray(llvm::Instruction *instruction) +{ + + /* + * Unnamed temp instructions need a generated name for the + * context array. Create one using a running integer. 
+ */ + std::ostringstream var; + var << "."; + + if (std::string(instruction->getName().str()) != "") + { + var << instruction->getName().str(); + } + else if (tempInstructionIds.find(instruction) != tempInstructionIds.end()) + { + var << tempInstructionIds[instruction]; + } + else + { + tempInstructionIds[instruction] = tempInstructionIndex++; + var << tempInstructionIds[instruction]; + } + + var << ".pocl_context"; + std::string varName = var.str(); + + if (contextArrays.find(varName) != contextArrays.end()) + return contextArrays[varName]; + + IRBuilder<> builder(instruction->getParent()->getParent()->getEntryBlock().getFirstInsertionPt()); + + llvm::Type *elementType; + if (isa<AllocaInst>(instruction)) + { + /* If the variable to be context saved was itself an alloca, + create one big alloca that stores the data of all the + work-items and directly return pointers to that array. + This enables moving all the allocas to the entry node without + breaking the parallel loop. + Otherwise we would rely on a dynamic alloca to allocate + unique stack space to all the work-items when its wiloop + iteration is executed. */ + elementType = + dyn_cast<AllocaInst>(instruction)->getType()->getElementType(); + } + else + { + elementType = instruction->getType(); + } + +// parameterize alloca to be based on _local_size_{x,y,z} + llvm::Value *wgsize = lsizeX; + if (maxDim > 1) wgsize = builder.CreateMul(wgsize, lsizeY); + if (maxDim > 2) wgsize = builder.CreateMul(wgsize, lsizeZ); + llvm::Type *contextArrayType = ArrayType::get(elementType, 1); + llvm::Instruction *alloca = + builder.CreateAlloca(elementType, wgsize, varName); + + contextArrays[varName] = alloca; + return alloca; +} + + +/** + * Adds context save/restore code for the value produced by the + * given instruction. + * + * TODO: add only one restore per variable per region. + * TODO: add only one load of the id variables per region. 
+ * Could be done by having a context restore BB in the beginning of the + * region and a context save BB at the end. + * TODO: ignore work group variables completely (the iteration variables) + * The LLVM should optimize these away but it would improve + * the readability of the output during debugging. + * TODO: rematerialize some values such as extended values of global + * variables (especially global id which is computed from local id) or kernel + * argument values instead of allocating stack space for them + */ +void +WorkitemLoops::AddContextSaveRestore +(llvm::Instruction *instruction) { + + /* Allocate the context data array for the variable. */ + llvm::Instruction *alloca = GetContextArray(instruction); + llvm::Instruction *theStore = AddContextSave(instruction, alloca); + + InstructionVec uses; + /* Restore the produced variable before each use to ensure the correct context + copy is used. + + We could add the restore only to other regions outside the + variable defining region and use the original variable in the defining + region due to the SSA virtual registers being unique. However, + alloca variables can be redefined also in the same region, thus we + need to ensure the correct alloca context position is written, not + the original unreplicated one. These variables can be generated by + volatile variables, private arrays, and due to the PHIs to allocas + pass. + */ + + /* Find out the uses to fix first as fixing them invalidates + the iterator. 
*/ + for (Instruction::use_iterator ui = instruction->use_begin(), + ue = instruction->use_end(); + ui != ue; ++ui) + { + Instruction *user; + if ((user = dyn_cast<Instruction> (*ui)) == NULL) continue; + if (user == theStore) continue; + uses.push_back(user); + } + + for (InstructionVec::iterator i = uses.begin(); i != uses.end(); ++i) + { + Instruction *user = *i; + Instruction *contextRestoreLocation = user; + /* If the user is in a block that doesn't belong to a region, + the variable itself must be a "work group variable", that is, + not dependent on the work item. Most likely an iteration + variable of a for loop with a barrier. */ + if (RegionOfBlock(user->getParent()) == NULL) continue; + + PHINode* phi = dyn_cast<PHINode>(user); + if (phi != NULL) + { + /* In case of PHI nodes, we cannot just insert the context + restore code before it in the same basic block because it is + assumed there are no non-phi Instructions before PHIs which + the context restore code constitutes to. Add the context + restore to the incomingBB instead. + + There can be values in the PHINode that are incoming + from another region even though the decision BB is within the region. + For those values we need to add the context restore code in the + incoming BB (which is known to be inside the region due to the + assumption of not having to touch PHI nodes in PRentry BBs). + */ + + /* PHINodes at region entries are broken down earlier. */ + assert ("Cannot add context restore for a PHI node at the region entry!" 
&& + RegionOfBlock(phi->getParent())->entryBB() != phi->getParent()); +#ifdef DEBUG_WORK_ITEM_LOOPS + std::cerr << "### adding context restore code before PHI" << std::endl; + user->dump(); + std::cerr << "### in BB:" << std::endl; + user->getParent()->dump(); +#endif + BasicBlock *incomingBB = NULL; + for (unsigned incoming = 0; incoming < phi->getNumIncomingValues(); + ++incoming) + { + Value *val = phi->getIncomingValue(incoming); + BasicBlock *bb = phi->getIncomingBlock(incoming); + if (val == instruction) incomingBB = bb; + } + assert (incomingBB != NULL); + contextRestoreLocation = incomingBB->getTerminator(); + } + llvm::Value *loadedValue = + AddContextRestore + (user, alloca, contextRestoreLocation, isa<AllocaInst>(instruction)); + user->replaceUsesOfWith(instruction, loadedValue); +#ifdef DEBUG_WORK_ITEM_LOOPS + std::cerr << "### done, the user was converted to:" << std::endl; + user->dump(); +#endif + } +} + +bool +WorkitemLoops::ShouldNotBeContextSaved(llvm::Instruction *instr) +{ + /* + _local_id loads should not be replicated as it leads to + problems in conditional branch case where the header node + of the region is shared across the branches and thus the + header node's ID loads might get context saved which leads + to egg-chicken problems. 
+ */ + llvm::LoadInst *load = dyn_cast<llvm::LoadInst>(instr); + if (load != NULL && + (load->getPointerOperand() == localIdZ || + load->getPointerOperand() == localIdY || + load->getPointerOperand() == localIdX)) + return true; + return false; +} + +llvm::BasicBlock * +WorkitemLoops::AppendIncBlock +(llvm::BasicBlock* after, llvm::Value *localIdVar) +{ + llvm::LLVMContext &C = after->getContext(); + + llvm::BasicBlock *oldExit = after->getTerminator()->getSuccessor(0); + assert (oldExit != NULL); + + llvm::BasicBlock *forIncBB = + BasicBlock::Create(C, "pregion.for.inc", after->getParent()); + + after->getTerminator()->replaceUsesOfWith(oldExit, forIncBB); + + IRBuilder<> builder(oldExit); + + builder.SetInsertPoint(forIncBB); + /* Create the iteration variable increment */ + builder.CreateStore + (builder.CreateAdd + (builder.CreateLoad(localIdVar), + ConstantInt::get(IntegerType::get(C, size_t_width), 1)), + localIdVar); + + builder.CreateBr(oldExit); + + return forIncBB; +} diff --git a/src/llvmopencl/WorkitemLoops.h b/src/llvmopencl/WorkitemLoops.h new file mode 100644 index 0000000..aac4cfa --- /dev/null +++ b/src/llvmopencl/WorkitemLoops.h @@ -0,0 +1,112 @@ +// Header for WorkitemLoops function pass. +// +// Copyright (c) 2012 Pekka Jääskeläinen / TUT +// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef _POCL_WORKITEM_LOOPS_H +#define _POCL_WORKITEM_LOOPS_H + +#include "llvm/ADT/Twine.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include <map> +#include <vector> +#include "WorkitemHandler.h" +#include "ParallelRegion.h" + +#define MAX_DIMENSIONS 3u + +namespace llvm { + class PostDominatorTree; +} + +namespace pocl { + class Workgroup; + + class WorkitemLoops : public pocl::WorkitemHandler { + + public: + static char ID; + + WorkitemLoops() : pocl::WorkitemHandler(ID) {} + + virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const; + virtual bool runOnFunction(llvm::Function &F); + + private: + + typedef std::vector<llvm::BasicBlock *> BasicBlockVector; + typedef std::set<llvm::Instruction* > InstructionIndex; + typedef std::vector<llvm::Instruction* > InstructionVec; + typedef std::map<std::string, llvm::Instruction*> StrInstructionMap; + + InstructionIndex workGroupVariables; + + llvm::DominatorTree *DT; + llvm::LoopInfo *LI; + llvm::PostDominatorTree *PDT; + + ParallelRegion::ParallelRegionVector *original_parallel_regions; + + StrInstructionMap contextArrays; + + virtual bool ProcessFunction(llvm::Function &F); + + void FixMultiRegionVariables(ParallelRegion *region); + void AddContextSaveRestore(llvm::Instruction *instruction); + + llvm::Instruction *AddContextSave(llvm::Instruction *instruction, llvm::Instruction *alloca); + llvm::Instruction *AddContextRestore + (llvm::Value 
*val, llvm::Instruction *alloca, + llvm::Instruction *before=NULL, + bool isAlloca=false); + llvm::Instruction *GetContextArray(llvm::Instruction *val); + + std::pair<llvm::BasicBlock *, llvm::BasicBlock *> + CreateLoopAround + (ParallelRegion &region, llvm::BasicBlock *entryBB, llvm::BasicBlock *exitBB, + bool peeledFirst, llvm::Value *localIdVar, size_t LocalSizeForDim, + bool addIncBlock=true, llvm::Instruction *lsizeDim=NULL); + void FindKernelDim(llvm::Function &F); + + llvm::BasicBlock * + AppendIncBlock + (llvm::BasicBlock* after, + llvm::Value *localIdVar); + + ParallelRegion* RegionOfBlock(llvm::BasicBlock *bb); + + bool ShouldNotBeContextSaved(llvm::Instruction *instr); + + std::map<llvm::Instruction*, unsigned> tempInstructionIds; + size_t tempInstructionIndex; + // An alloca in the kernel which stores the first iteration to execute + // in the inner (dimension 0) loop. This is set to 1 in a peeled iteration + // to skip the 0, 0, 0 iteration in the loops. + llvm::Value *localIdXFirstVar; + + unsigned int maxDim; + llvm::Instruction *lsizeX, *lsizeY, *lsizeZ; + }; +} + +#endif diff --git a/src/llvmopencl/WorkitemReplication.cc b/src/llvmopencl/WorkitemReplication.cc new file mode 100644 index 0000000..b6ea3cd --- /dev/null +++ b/src/llvmopencl/WorkitemReplication.cc @@ -0,0 +1,308 @@ +// LLVM function pass to replicate the kernel body for all work items +// in a work group. 
+// +// Copyright (c) 2011-2012 Carlos Sánchez de La Lama / URJC and +// Pekka Jääskeläinen / TUT +// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+
+#define DEBUG_TYPE "workitem"
+
+#include "WorkitemReplication.h"
+#include "Workgroup.h"
+#include "Barrier.h"
+#include "Kernel.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "config.h"
+#ifdef LLVM_3_1
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/ValueSymbolTable.h"
+#elif defined LLVM_3_2
+#include "llvm/IRBuilder.h"
+#include "llvm/DataLayout.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/ValueSymbolTable.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#endif
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "WorkitemHandlerChooser.h"
+
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <vector>
+
+//#define DEBUG_BB_MERGING
+//#define DUMP_RESULT_CFG
+//#define DEBUG_PR_REPLICATION
+
+#ifdef DUMP_RESULT_CFG
+#include "llvm/Analysis/CFGPrinter.h"
+#endif
+
+using namespace llvm;
+using namespace pocl;
+
+STATISTIC(ContextValues, "Number of SSA values which have to be context-saved");
+STATISTIC(ContextSize, "Context size per workitem in bytes");
+
+// Registers the pass under the command-line name "workitemrepl".
+namespace {
+  static
+  RegisterPass<WorkitemReplication> X("workitemrepl", "Workitem replication pass");
+}
+
+char WorkitemReplication::ID = 0;
+
+// Declares the analyses this pass requests: DominatorTree, LoopInfo and
+// pocl::WorkitemHandlerChooser. The TargetData/DataLayout requirement is
+// currently compiled out by the "#if 0" below.
+void
+WorkitemReplication::getAnalysisUsage(AnalysisUsage &AU) const
+{
+  AU.addRequired<DominatorTree>();
+  AU.addRequired<LoopInfo>();
+
+// TODO - removed due to compilation error
+#if 0
+#ifdef LLVM_3_1
+  AU.addRequired<TargetData>();
+#else
+  AU.addRequired<DataLayout>();
+#endif
+#endif
+  AU.addRequired<pocl::WorkitemHandlerChooser>();
+}
+
+// Pass entry point. Returns false without touching F unless
+// Workgroup::isKernelToProcess(F) holds and the WorkitemHandlerChooser
+// selected POCL_WIH_FULL_REPLICATION. Otherwise runs ProcessFunction() and
+// fixUndominatedVariableUses() and reports whether either of them changed F.
+bool
+WorkitemReplication::runOnFunction(Function &F)
+{
+  if (!Workgroup::isKernelToProcess(F))
+    return false;
+
+  if (getAnalysis<pocl::WorkitemHandlerChooser>().chosenHandler() != 
+      pocl::WorkitemHandlerChooser::POCL_WIH_FULL_REPLICATION)
+    return false;
+
+  DT = &getAnalysis<DominatorTree>();
+  LI = &getAnalysis<LoopInfo>();
+
+  bool changed = ProcessFunction(F);
+#ifdef DUMP_RESULT_CFG
+  FunctionPass* cfgPrinter = createCFGPrinterPass();
+  cfgPrinter->runOnFunction(F);
+#endif
+
+  changed |= fixUndominatedVariableUses(DT, F);
+  return changed;
+}
+
+// Replicates every parallel region of kernel F once per work-item.
+//
+// Steps, in the order they appear below:
+//  1. update the ContextValues/ContextSize statistics from values that are
+//     used outside the region that defines them;
+//  2. create one replica of each original region for every (x, y, z) index
+//     except index 0, which keeps the original regions;
+//  3. remap each replica, chain it after the previous work-item's region
+//     and insert a prologue into every region;
+//  4. move PHI nodes from each replica's entry block to its unique
+//     predecessor;
+//  5. suffix the original non-barrier blocks with ".wi_0_0_0" and add the
+//     local size initialisation code.
+//
+// Always returns true.
+bool
+WorkitemReplication::ProcessFunction(Function &F)
+{
+  Module *M = F.getParent();
+
+//  F.viewCFG();
+
+  Kernel *K = cast<Kernel> (&F);
+  Initialize(K);
+
+  // Allocate space for workitem reference maps. Workitem 0 does
+  // not need it.
+  // NOTE(review): workitem_count is unsigned and "workitem_count - 1" is
+  // used as an array length further down; presumably Initialize(K) never
+  // leaves a local size at 0 -- confirm.
+  unsigned workitem_count = LocalSizeZ * LocalSizeY * LocalSizeX;
+
+  BasicBlockVector original_bbs;
+  for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
+      if (!Barrier::hasBarrier(i))
+        original_bbs.push_back(i);
+  }
+
+  ParallelRegion::ParallelRegionVector* original_parallel_regions =
+    K->getParallelRegions(LI);
+
+  std::vector<SmallVector<ParallelRegion *, 8> > parallel_regions(workitem_count);
+
+  parallel_regions[0] = *original_parallel_regions;
+
+  /* Enable to get region identification printouts */
+#if 0
+  for (ParallelRegion::ParallelRegionVector::iterator
+           i = original_parallel_regions->begin(), 
+           e = original_parallel_regions->end();
+       i != e; ++i) 
+  {
+    ParallelRegion *region = (*i);
+    region->InjectRegionPrintF();
+    region->InjectVariablePrintouts();
+  }
+#endif
+
+  // Measure the required context (variables alive in more than one region).
+  // NOTE(review): this analysis is fetched although its addRequired<> call
+  // in getAnalysisUsage() is compiled out -- confirm the pass manager still
+  // provides it.
+#ifdef LLVM_3_1
+  TargetData &TD = getAnalysis<TargetData>();
+#else
+  DataLayout &TD = getAnalysis<DataLayout>();
+#endif
+
+  for (SmallVector<ParallelRegion *, 8>::iterator
+         i = original_parallel_regions->begin(), e = original_parallel_regions->end();
+       i != e; ++i) {
+    ParallelRegion *pr = (*i);
+
+    for (ParallelRegion::iterator i2 = pr->begin(), e2 = pr->end();
+         i2 != e2; ++i2) {
+      BasicBlock *bb = (*i2);
+
+      for (BasicBlock::iterator i3 = bb->begin(), e3 = bb->end();
+           i3 != e3; ++i3) {
+        for (Value::use_iterator i4 = i3->use_begin(), e4 = i3->use_end();
+             i4 != e4; ++i4) {
+          // Instructions can only be used by instructions.
+          Instruction *user = cast<Instruction> (*i4);
+
+          if (find (pr->begin(), pr->end(), user->getParent()) ==
+              pr->end()) {
+            // User is not in the defining region.
+            ++ContextValues;
+            ContextSize += TD.getTypeAllocSize(i3->getType());
+            // One outside use is enough; count each value at most once.
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  // Then replicate the ParallelRegions.
+  // reference_map[index - 1] holds the value map of work-item "index";
+  // it is released with delete[] at the end of this function.
+  ValueToValueMapTy *const reference_map = new ValueToValueMapTy[workitem_count - 1];
+  for (int z = 0; z < LocalSizeZ; ++z) {
+    for (int y = 0; y < LocalSizeY; ++y) {
+      for (int x = 0; x < LocalSizeX ; ++x) {
+
+        int index = 
+          (LocalSizeY * LocalSizeX * z + LocalSizeX * y + x);
+
+        // Index 0 uses the original regions, so nothing is cloned for it.
+        if (index == 0)
+          continue;
+
+        for (SmallVector<ParallelRegion *, 8>::iterator
+               i = original_parallel_regions->begin(), 
+               e = original_parallel_regions->end();
+             i != e; ++i) {
+          ParallelRegion *original = (*i);
+          ParallelRegion *replicated =
+            original->replicate
+            (reference_map[index - 1],
+             (".wi_" + Twine(x) + "_" + Twine(y) + "_" + Twine(z)));
+          if (AddWIMetadata)
+            replicated->AddIDMetadata(M->getContext(), x, y, z);
+          parallel_regions[index].push_back(replicated);
+#ifdef DEBUG_PR_REPLICATION
+          std::cerr << "### new replica:" << std::endl;
+          replicated->dump();
+#endif
+        }
+      }
+    }
+  }
+  if (AddWIMetadata) {
+    for (SmallVector<ParallelRegion *, 8>::iterator
+          i = original_parallel_regions->begin(), 
+          e = original_parallel_regions->end();
+          i != e; ++i) {
+      ParallelRegion *original = (*i);  
+      original->AddIDMetadata(M->getContext(), 0, 0, 0);
+    }
+  }  
+
+  for (int z = 0; z < LocalSizeZ; ++z) {
+    for (int y = 0; y < LocalSizeY; ++y) {
+      for (int x = 0; x < LocalSizeX ; ++x) {
+
+        int index = 
+          (LocalSizeY * LocalSizeX * z + LocalSizeX * y + x);
+
+        for (unsigned i = 0, e = parallel_regions[index].size(); i != e; ++i) {
+          ParallelRegion *region = parallel_regions[index][i];
+          if (index != 0) {
+            region->remap(reference_map[index - 1]);
+            region->chainAfter(parallel_regions[index - 1][i]);
+            region->purge();
+          }
+          region->insertPrologue(x, y, z);
+        }
+      }
+    }
+  }
+
+  // Try to merge all workitem first block of each region
+  // together (for PHI predecessor correctness).
+  for (int z = LocalSizeZ - 1; z >= 0; --z) {
+    for (int y = LocalSizeY - 1; y >= 0; --y) {
+      for (int x = LocalSizeX - 1; x >= 0; --x) {
+
+        int index = 
+          (LocalSizeY * LocalSizeX * z + LocalSizeX * y + x);
+
+        if (index == 0)
+          continue;
+
+        for (unsigned i = 0, e = parallel_regions[index].size(); i != e; ++i) {
+          ParallelRegion *region = parallel_regions[index][i];
+          BasicBlock *entry = region->entryBB();
+
+          assert (entry != NULL);
+          BasicBlock *pred = entry->getUniquePredecessor();
+          assert (pred != NULL && "No unique predecessor.");
+#ifdef DEBUG_BB_MERGING
+          std::cerr << "### pred before merge into predecessor " << std::endl;
+          pred->dump();
+          std::cerr << "### entry before merge into predecessor " << std::endl;
+          entry->dump();
+#endif
+          movePhiNodes(entry, pred);
+        }
+      }
+    }
+  }
+
+  // Add the suffixes to original (wi_0_0_0) basic blocks.
+  for (BasicBlockVector::iterator i = original_bbs.begin(),
+         e = original_bbs.end();
+       i != e; ++i)
+    (*i)->setName((*i)->getName() + ".wi_0_0_0");
+
+  // Initialize local size variables (done at the end to avoid unnecessary
+  // replication).
+  K->addLocalSizeInitCode(LocalSizeX, LocalSizeY, LocalSizeZ);
+
+  delete [] reference_map;
+
+//  F.viewCFG();
+
+  return true;
+}
+
diff --git a/src/llvmopencl/WorkitemReplication.h b/src/llvmopencl/WorkitemReplication.h
new file mode 100644
index 0000000..fb5d9d4
--- /dev/null
+++ b/src/llvmopencl/WorkitemReplication.h
@@ -0,0 +1,62 @@
+// Header for WorkitemReplication function pass.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos and
+//               2012 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+ +#ifndef _POCL_WORKITEM_REPLICATION_H +#define _POCL_WORKITEM_REPLICATION_H + +#include "llvm/ADT/Twine.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include <map> +#include <vector> +#include "WorkitemHandler.h" + +namespace pocl { + class Workgroup; + + class WorkitemReplication : public pocl::WorkitemHandler { + + public: + static char ID; + + WorkitemReplication() : pocl::WorkitemHandler(ID) {} + + virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const; + virtual bool runOnFunction(llvm::Function &F); + + + private: + + llvm::DominatorTree *DT; + llvm::LoopInfo *LI; + + typedef std::set<llvm::BasicBlock *> BasicBlockSet; + typedef std::vector<llvm::BasicBlock *> BasicBlockVector; + typedef std::map<llvm::Value *, llvm::Value *> ValueValueMap; + + virtual bool ProcessFunction(llvm::Function &F); + }; +} + +#endif diff --git a/src/llvmopencl/config.h b/src/llvmopencl/config.h new file mode 100644 index 0000000..1f1ed9d --- /dev/null +++ b/src/llvmopencl/config.h @@ -0,0 +1 @@ +// Empty on purpose. Satifies includes from other files. diff --git a/src/llvmopencl/pocl.h b/src/llvmopencl/pocl.h new file mode 100644 index 0000000..ae6a66d --- /dev/null +++ b/src/llvmopencl/pocl.h @@ -0,0 +1,49 @@ +/* pocl.h - global pocl declarations. 
+ + Copyright (c) 2011 Universidad Rey Juan Carlos + 2011-2014 Pekka Jääskeläinen / Tampere University of Technology + Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +/** + * @file pocl.h + * + * The declarations in this file are such that are used both in the + * libpocl implementation CL and the kernel compiler. Others should be + * moved to pocl_cl.h of lib/CL or under the kernel compiler dir. + * @todo Check if there are extra declarations here that could be moved. + */ +#ifndef POCL_H +#define POCL_H + +/* + * During pocl kernel compiler transformations we use the fixed address + * space ids of clang's -ffake-address-space-map to mark the different + * address spaces to keep the processing target-independent. These + * are converted to the target's address space map (if any), in a final + * kernel compiler pass. 
+ */
+#define POCL_ADDRESS_SPACE_PRIVATE 0
+#define POCL_ADDRESS_SPACE_GLOBAL 1
+#define POCL_ADDRESS_SPACE_LOCAL 2
+#define POCL_ADDRESS_SPACE_CONSTANT 3
+
+#endif /* POCL_H */
diff --git a/src/runtime/CMakeLists.txt b/src/runtime/CMakeLists.txt
new file mode 100644
index 0000000..f3d34ab
--- /dev/null
+++ b/src/runtime/CMakeLists.txt
@@ -0,0 +1,59 @@
+# If building for ARM target host then set appropriate clang target
+# Needs to match what's used when using clang to build the kernel
+# See compiler.cpp
+if (HAWKING_BUILD)
+  set(HOST_TARGET -target spir-unknown-unknown-unknown)
+endif()
+
+# Both branches below define the target generate_stdlib_c, which produces
+# stdlib.c.bc.embed.h in the current binary directory by running embed.py
+# (usage: embed.py <outfile> <filenames..>).
+
+# If Shamrock build, then we use the builtins.lib built in ../builtins
+if (SHAMROCK_BUILD)
+  # Embed the prebuilt builtins library.
+  add_custom_command(
+    OUTPUT stdlib.c.bc.embed.h
+    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/embed.py
+            ${CMAKE_CURRENT_BINARY_DIR}/stdlib.c.bc.embed.h
+            ${CMAKE_CURRENT_BINARY_DIR}/../builtins/builtins.lib
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/embed.py
+            ${CMAKE_CURRENT_BINARY_DIR}/../builtins/builtins.lib
+    VERBATIM)
+
+  add_custom_target(generate_stdlib_c DEPENDS
+                    ${CMAKE_CURRENT_BINARY_DIR}/stdlib.c.bc.embed.h)
+# otherwise, this stdlib.c is still being used (but is empty)
+else()
+  set(CUSTOM_COMMAND
+      ${CLANG_EXECUTABLE} -c -emit-llvm -x cl -O2 ${HOST_TARGET} -nostdinc -fno-builtin)
+
+  # Compile stdlib.c to LLVM bitcode. It depends on the generated
+  # stdlib_impl.h, hence the extra include path into the binary directory.
+  add_custom_command(
+    OUTPUT stdlib.c.bc
+    COMMAND ${CUSTOM_COMMAND}
+            -I${OCL_BUILTINS_DIR}/include
+            ${CMAKE_CURRENT_SOURCE_DIR}/stdlib.c
+            -I${CMAKE_CURRENT_BINARY_DIR}
+            -o ${CMAKE_CURRENT_BINARY_DIR}/stdlib.c.bc
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/stdlib.c
+            ${CMAKE_CURRENT_BINARY_DIR}/stdlib_impl.h
+    VERBATIM)
+
+  # Embed the bitcode produced above.
+  add_custom_command(
+    OUTPUT stdlib.c.bc.embed.h
+    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/embed.py
+            ${CMAKE_CURRENT_BINARY_DIR}/stdlib.c.bc.embed.h
+            ${CMAKE_CURRENT_BINARY_DIR}/stdlib.c.bc
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/embed.py
+            ${CMAKE_CURRENT_BINARY_DIR}/stdlib.c.bc
+    VERBATIM)
+
+  add_custom_target(generate_stdlib_c DEPENDS
+                    ${CMAKE_CURRENT_BINARY_DIR}/stdlib.c.bc.embed.h)
+
+  # Generate the four builtin headers from builtins.def
+  # (usage: builtins.py <def> <outdir>).
+  add_custom_command(
+    OUTPUT builtins_def.h stdlib_def.h builtins_impl.h stdlib_impl.h
+    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/builtins.py
+            ${CMAKE_CURRENT_SOURCE_DIR}/builtins.def
+            ${CMAKE_CURRENT_BINARY_DIR}
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/builtins.py
+            ${CMAKE_CURRENT_SOURCE_DIR}/builtins.def
+    VERBATIM)
+
+  add_custom_target(generate_builtins DEPENDS
+                    ${CMAKE_CURRENT_BINARY_DIR}/builtins_def.h
+                    ${CMAKE_CURRENT_BINARY_DIR}/builtins_impl.h
+                    ${CMAKE_CURRENT_BINARY_DIR}/stdlib_def.h
+                    ${CMAKE_CURRENT_BINARY_DIR}/stdlib_impl.h)
+endif()
diff --git a/src/runtime/builtins.def b/src/runtime/builtins.def
new file mode 100644
index 0000000..b94807b
--- /dev/null
+++ b/src/runtime/builtins.def
@@ -0,0 +1,301 @@
+def vecf : float2 float3 float4 float8 float16
+def veci : int2 int3 int4 int8 int16
+
+def vec : $vecf $veci
+def gentype : float $vecf
+
+// gentype acos(gentype)
+// REPL is defined in src/core/cpu/builtins.cpp
+//native float acos float : x:float
+    //return std::acos(x);
+//end
+
+//native $type acos $vecf : x:$type
+    //REPL($vecdim)
+        //result[i] = std::acos(x[i]);
+//end
+
+// gentype acosh(gentype)
+//native float acosh float : x:float
+    //return boost::math::acosh(x);
+//end
+
+//native $type acosh $vecf : x:$type
+    //REPL($vecdim)
+        //result[i] = boost::math::acosh(x[i]);
+//end
+
+// gentype acospi(gentype)
+//func float acospi float : x:float
+    //return acos(x) / M_PI;
+//end
+
+//native $type acospi $vecf : x:$type
+    //REPL($vecdim)
+        //result[i] = std::acos(x[i]) / M_PI;
+//end
+
+// gentype asin (gentype)
+//native float asin float : x:float
+    //return std::asin(x);
+//end
+
+//native $type asin $vecf : x:$type
+    //REPL($vecdim)
+        //result[i] = std::asin(x[i]);
+//end
+
+// gentype asinh (gentype)
+//native float asinh float : x:float
+    //return boost::math::asinh(x);
+//end
+
+//native $type asinh $vecf : x:$type
+    //REPL($vecdim)
+        //result[i] = boost::math::asinh(x[i]);
+//end
+
+// gentype asinpi (gentype x)
+//func float asinpi float : x:float
+    //return
asin(x) / M_PI; +//end + +//native $type asinpi $vecf : x:$type + //REPL($vecdim) + //result[i] = std::asin(x[i]) / M_PI; +//end + +// gentype atan (gentype y_over_x) +//native float atan float : y_over_x:float + //return std::atan(y_over_x); +//end + +//native $type atan $vecf : y_over_x:$type + //REPL($vecdim) + //result[i] = std::atan(y_over_x[i]); +//end + +// gentype atan2 (gentype y, gentype x) +//func float atan2 float : x:float y:float + //return atan(y / x); +//end + +//native $type atan2 $vecf : x:$type y:$type + //REPL($vecdim) + //result[i] = std::atan(y[i] / x[i]); +//end + +// gentype atanh (gentype) +//native float atanh float : x:float + //return boost::math::atanh(x); +//end + +//native $type atanh $vecf : x:$type + //REPL($vecdim) + //result[i] = boost::math::atanh(x[i]); +//end + +// gentype atanpi (gentype x) +//func float atanpi float : x:float + //return atan(x) / M_PI; +//end + +//native $type atanpi $vecf : x:$type + //REPL($vecdim) + //result[i] = std::atan(x[i]) / M_PI; +//end + +// gentype atan2pi (gentype y, gentype x) +//func float atan2pi float : x:float y:float + //return atan2(y, x) / M_PI; +//end +// +//native $type atan2pi $vecf : x:$type y:$type + //REPL($vecdim) + //result[i] = std::atan(y[i] / x[i]) / M_PI; +//end + +// gentype cbrt (gentype) +//native float cbrt float : x:float + //return boost::math::cbrt(x); +//end +// +//native $type cbrt $vecf : x:$type + //REPL($vecdim) + //result[i] = boost::math::cbrt(x[i]); +//end + +// gentype ceil (gentype) +//native float ceil float : x:float + //return std::ceil(x); +//end +// +//native $type ceil $vecf : x:$type + //REPL($vecdim) + //result[i] = std::ceil(x[i]); +//end + +// gentype copysign (gentype x, gentype y) +//func $type copysign $gentype : x:$type y:$type + //return ( + //(x < 0.0f & y > 0.0f) | + //(x > 0.0f & y < 0.0f) + //? 
-x : x); +//end + +//gentype cos (gentype) +//native float cos float : x:float + //return std::cos(x); +//end + +//native $type cos $vecf : x:$type + //REPL($vecdim) + //result[i] = std::cos(x[i]); +//end + +// gentype cosh (gentype) +//native float cosh float : x:float + //return std::cosh(x); +//end + +//native $type cosh $vecf : x:$type + //REPL($vecdim) + //result[i] = std::cosh(x[i]); +//end + +// gentype cospi (gentype x) +//func $type cospi $gentype : x:$type + //return cos(x * (float)M_PI); +//end + +// TODO: gentype erfc (gentype) +// TODO: gentype erf (gentype) + +// gentype exp(gentype x) +//native float exp float : x:float + //return std::exp(x); +//end +// +//native $type exp $vecf : x:$type + //REPL($vecdim) + //result[i] = std::exp(x[i]); +//end +// +// gentype exp2(gentype x) +//native float exp2 float : x:float + //return exp2f(x); +//end +// +//native $type exp2 $vecf : x:$type + //REPL($vecdim) + //result[i] = exp2f(x[i]); +//end +// +//// gentype exp10(gentype x) +//native float exp10 float : x:float + //return exp10f(x); +//end +// +//native $type exp10 $vecf : x:$type + //REPL($vecdim) + //result[i] = exp10f(x[i]); +//end +// +//// gentype expm1(gentype x) +//func $type expm1 $gentype : x:$type + //return exp(x) - 1.0f; +//end +// +//// gentype fdim(x, y) +//func $type fdim $gentype : x:$type y:$type + //return (x > y ? 
x - y : 0.0f); +//end +// +// gentype floor(gentype x) (TODO: SSE fast path : float->int->float) +//native float floor float : x:float + //return std::floor(x); +//end +// +//native $type floor $vecf : x:$type + //REPL($vecdim) + //result[i] = std::floor(x[i]); +//end +// +//// gentype fma(a, b, c) : a*b + c (TODO) +//func $type fma $gentype : a:$type b:$type c:$type + //return (a * b) + c; +//end +// +//// gentype trunc(x) +//native float trunc float : x:float + //return boost::math::trunc(x); +//end +// +//native $type trunc $vecf : x:$type + //REPL($vecdim) + //result[i] = boost::math::trunc(x[i]); +//end +// +//// gentype fmod(x, y) +//func $type fmod $gentype : x:$type y:$type + //return x - y * trunc(x / y); +//end +// +// gentype fract(gentype x, gentype *iptr) +//func $type fract $gentype : x:$type iptr:*$type + //*iptr = floor(x); + //return fmin(x - *iptr, 0x1.fffffep-1f); +//end + +// gentype frexp(gentype x, intn *exp) +//native float frexp float : x:float exp:*int + //return std::frexp(x, exp); +//end +// +//native $type frexp $vecf : x:$type exp:*int$vecdim + //REPL($vecdim) + //result[i] = std::frexp(x[i], &exp[i]); +//end +// +//// gentype sqrt(gentype x) +//native float sqrt float : x:float + //return std::sqrt(x); +//end +// +//native double sqrt double : x:double + //return std::sqrt(x); +//end +// +//native double log double : x:double + //return std::log(x); +//end +// +//native $type sqrt $vecf : x:$type + //REPL($vecdim) + //result[i] = std::sqrt(x[i]); +//end +// +//// gentype hypot(gentype x, gentype y) +//func $type hypot $gentype : x:$type y:$type + //return sqrt(x*x + y*y); +//end + +// intn ilogb(gentype x) +//native int ilogb float : x:float + //return ilogb(x); +//end + +//native int$vecdim ilogb $vecf : x:$type + //REPL($vecdim) + //result[i] = ilogb(x[i]); +//end + +// gentype ldexp(gentype x, intn n) +//native float ldexp float : x:float n:int + //return std::ldexp(x, n); +//end + +//native $type ldexp $vecf : x:$type n:int$vecdim 
+ //REPL($vecdim) + //result[i] = std::ldexp(x[i], n[i]); +//end diff --git a/src/runtime/builtins.py b/src/runtime/builtins.py new file mode 100755 index 0000000..909fee8 --- /dev/null +++ b/src/runtime/builtins.py @@ -0,0 +1,380 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> +# Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/ +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the copyright holder nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +# builtins.py <def> <outdir> + +import sys + +class Function: + class Arg: + def __init__(self, name, t): + self.name = name + self.t = t + + KIND_BUILTINS_IMPL = 0 # static function in builtins.cpp + KIND_BUILTINS_DEF = 1 # if (name == '__cpu_$name') return (void *)&name; + KIND_STDLIB_IMPL = 2 # OpenCL C function in stdlib.c + KIND_STDLIB_DEF = 3 # Header in stdlib.h + KIND_STDLIB_STUB = 4 # OpenCL C stub in stdlib.c: calls __cpu_$name + KIND_STDLIB_STUB_DEF = 5 # __cpu_$name declared in stdlib.c + + def __init__(self, name, native): + self.name = name + self.native = native + + self.args = [] # Array <Arg> + self.types = [] # Array <str> + self.return_type = '' + self.body = '' + + def set_return_type(self, ty): + self.return_type = ty + + def append_body(self, body): + self.body += body + + def add_arg(self, name, ty): + self.args.append(self.Arg(name, ty)) + + def add_type(self, ty): + self.types.append(ty) + + def mangled_name(self, current_type): + return_type = self.process_type_name(current_type, self.return_type) + + rs = return_type + '_' + self.name + first = True + + for a in self.args: + if first: + rs += '_' + first = False + + arg_type = self.process_type_name(current_type, a.t) + rs += arg_type.replace('*', 'p') + + return rs + + def process_type_name(self, current_type, type_name): + # Current vector dimension + vecdim = '1' + + if current_type[-1].isdigit(): + if current_type[-2].isdigit(): + vecdim = current_type[-2:] + else: + vecdim = current_type[-1] + + # $vecdim expansion + return type_name.replace('$vecdim', vecdim).replace('$type', current_type) + + def arg_list(self, current_type, handle_first_arg): + rs = '' + first = True + append_arg = None + + # We may need a first "result" arg + if handle_first_arg: + return_type = self.process_type_name(current_type, self.return_type) + + if return_type[-1].isdigit(): + # Return is a vector + append_arg = self.Arg('result', return_type) + + if append_arg: + args = [append_arg] + self.args + 
else: + args = self.args + + for arg in args: + # Resolve type + arg_type = self.process_type_name(current_type, arg.t) + + if arg_type[0] == '*': + arg_ptr = True + arg_type = arg_type[1:] + else: + arg_ptr = False + + # We need to pass vector arguments as pointers + arg_vector = False + if handle_first_arg: + arg_vector = arg_type[-1].isdigit() + arg_type = arg_type.rstrip('0123456789') + + # Build the string + if not first: + rs += ', ' + first = False + + rs += arg_type + ' ' + + if arg_vector or arg_ptr: + rs += '*' + + rs += arg.name + + return rs + + def write(self, current_type, kind): + # Template: + # (static) $ret_type $name($args) { + # $body + # } + rs = '' + + if kind == self.KIND_BUILTINS_IMPL: + rs = 'static ' + elif kind == self.KIND_BUILTINS_DEF: + rs += ' else if (name == "__cpu_' + self.mangled_name(current_type) + '")\n' + rs += ' return (void *)&' + self.mangled_name(current_type) + ';\n' + return rs + + # Calculate return type + return_type = self.process_type_name(current_type, self.return_type) + + if (kind == self.KIND_BUILTINS_IMPL or kind == self.KIND_STDLIB_STUB_DEF) \ + and return_type[-1].isdigit(): + return_type = 'void' # We'll use a 'result' argument + + rs += return_type + ' ' + + # Append mangled name if needed + if kind == self.KIND_BUILTINS_IMPL: + rs += self.mangled_name(current_type) + elif kind == self.KIND_STDLIB_STUB_DEF: + rs += '__cpu_' + self.mangled_name(current_type) + else: + # No need to mangle the name, but add OVERLOAD + rs += '_CLC_OVERLOAD ' + self.name + + # Print function args + rs += '(' + rs += self.arg_list(current_type, kind == self.KIND_BUILTINS_IMPL or \ + kind == self.KIND_STDLIB_STUB_DEF) + rs += ')' + + # If only a declaration, end it + if kind == self.KIND_STDLIB_DEF or kind == self.KIND_STDLIB_STUB_DEF: + rs += ';\n' + return rs + + # Add the body + rs += '\n{\n' + + if kind == self.KIND_STDLIB_STUB: + # Special body : call __cpu_$name + return_is_vector = return_type[-1].isdigit() + if 
return_is_vector: + # Need to create a temporary + rs += ' ' + return_type + ' result;\n' + rs += '\n' + + # Call the cpu stub + rs += ' ' + if not return_is_vector: + rs += 'return ' + + rs += '__cpu_' + self.mangled_name(current_type) + '(' + + # Pass the result if needed + first = True + if return_is_vector: + rs += '(' + return_type.rstrip('0123456789') + ' *)&result' + first = False + + # Append the args + for arg in self.args: + # Resolve type + arg_type = self.process_type_name(current_type, arg.t) + + arg_ptr = False + if arg_type[0] == '*': + arg_type = arg_type[1:] + arg_ptr = True + + arg_vector = arg_type[-1].isdigit() + + if not first: + rs += ', ' + first = False + + # We need to pass vector arguments as pointers + if arg_vector: + rs += '(' + arg_type.rstrip('0123456789') + ' *)' + if not arg_ptr: + rs += '&' + + rs += arg.name + + # End the call + rs += ');\n' + + if return_is_vector: + rs += '\n return result;\n' + + rs += '}\n\n' + else: + # Simply copy the body + vecdim = '1' + + if current_type[-1].isdigit(): + if current_type[-2].isdigit(): + vecdim = current_type[-2:] + else: + vecdim = current_type[-1] + + rs += self.body.replace('$type', current_type) \ + .replace('$vecdim', vecdim) + rs += '\n}\n\n' + + return rs + +class Generator: + builtins_impl_file = 'builtins_impl.h' # static functions + builtins_def_file = 'builtins_def.h' # if () in getBuiltin + stdlib_impl_file = 'stdlib_impl.h' # stdlib.c functions + stdlib_def_file = 'stdlib_def.h' # stdlib.h definitions + + def __init__(self, out_path): + self.out_path = out_path + + # Buffers + self.builtins_impl_buffer = '' + self.builtins_def_buffer = '' + self.stdlib_impl_buffer = '' + self.stdlib_def_buffer = '' + + def add_function(self, function): + for t in function.types: + if function.native: + self.stdlib_impl_buffer += function.write(t, function.KIND_STDLIB_STUB_DEF) + self.stdlib_impl_buffer += function.write(t, function.KIND_STDLIB_STUB) + self.stdlib_def_buffer += 
function.write(t, function.KIND_STDLIB_DEF) + self.builtins_impl_buffer += function.write(t, function.KIND_BUILTINS_IMPL) + self.builtins_def_buffer += function.write(t, function.KIND_BUILTINS_DEF) + else: + self.stdlib_def_buffer += function.write(t, function.KIND_STDLIB_DEF) + self.stdlib_impl_buffer += function.write(t, function.KIND_STDLIB_IMPL) + + def write(self): + of = open(self.out_path + '/' + self.stdlib_def_file, 'w') + of.write(self.stdlib_def_buffer) + of.close() + + of = open(self.out_path + '/' + self.stdlib_impl_file, 'w') + of.write(self.stdlib_impl_buffer) + of.close() + + of = open(self.out_path + '/' + self.builtins_def_file, 'w') + of.write(self.builtins_def_buffer) + of.close() + + of = open(self.out_path + '/' + self.builtins_impl_file, 'w') + of.write(self.builtins_impl_buffer) + of.close() + +class Parser: + def __init__(self, generator, def_file_name): + self.generator = generator + self.def_file_name = def_file_name + + self.defs = {} + + def replace_variable(self, token): + result = [] + + if token[0] == '$': + for tok in self.defs[token[1:]]: + result.extend(self.replace_variable(tok)) + else: + result.append(token) + + return result + + def parse(self): + def_file = open(self.def_file_name, 'rb') + current_function = None + + for line in def_file: + if current_function: + # End if we encounter an end + if line.startswith('end'): + self.generator.add_function(current_function) + current_function = None + else: + # Add a line to the body + current_function.append_body(line) + else: + line = line.strip() + tokens = line.split(' ') + tok = tokens[0] + + if tok == 'def': + # A definition : def <variable> : [values] + name = tokens[1] + values = [] + + for token in tokens[3:]: + values.extend(self.replace_variable(token)) + + self.defs[name] = values + elif tok == 'func' or tok == 'native': + # Function : func|native <ret_type> <name> [types] : [args] + current_function = Function(tokens[2], \ + tokens[0] == 'native') + + 
current_function.set_return_type(tokens[1]) + + # Explore the types and args + in_types = True + + for token in tokens[3:]: + if token == ':': + in_types = False + elif in_types: + for ty in self.replace_variable(token): + current_function.add_type(ty) + else: + # Parameters + parts = token.split(':') + current_function.add_arg(parts[0], parts[1]) + + def_file.close() + +if __name__ == '__main__': + def_file = sys.argv[1] + out_dir = sys.argv[2] + + gen = Generator(out_dir) + parser = Parser(gen, def_file) + + parser.parse() + gen.write() diff --git a/src/runtime/embed.py b/src/runtime/embed.py new file mode 100755 index 0000000..e3aca9d --- /dev/null +++ b/src/runtime/embed.py @@ -0,0 +1,76 @@ +#!/usr/bin/python +# #!/usr/local/bin/python2.6-2.6.4 +# -*- coding: utf-8 -*- +# +# Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the copyright holder nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# embed.py <outfile> <filenames..> +# <filenames> => <outfile> + +import sys + +outfile = open(sys.argv[1], 'w') +name = sys.argv[1].split('/')[-1].replace('.embed.h', '').replace('.', '_') + +data = '' + +for i in xrange(len(sys.argv) - 1): + infile = open(sys.argv[i + 1], 'rb') + data += infile.read() + +# Header +outfile.write('#ifndef __%s__\n' % name.upper()) +outfile.write('#define __%s__\n' % name.upper()) +outfile.write('\n') +outfile.write('const char embed_%s[] =\n' % name) + +# Write it in chunks of 80 chars : +# | "\x00..." (4+1+1 + 4*chars ==> chars = 18) +index = 0 + +for c in data: + if index == 0: + outfile.write(' "') + + outfile.write('\\x%s' % ('%x' % ord(c)).rjust(2, '0')) + index += 1 + + if index == 18: + index = 0 + outfile.write('"\n') + +# We may need to terminate a line +if index != 0: + outfile.write('";\n') +else: + outfile.write(';\n') # Alone on its line, poor semicolon + +# Footer +outfile.write('\n') +outfile.write('#endif\n') + +infile.close() +outfile.close() diff --git a/src/runtime/stdlib.c b/src/runtime/stdlib.c new file mode 100644 index 0000000..9b115df --- /dev/null +++ b/src/runtime/stdlib.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +int debug(const char *format, ...); + +/* WARNING: Due to some device-specific things in stdlib.h, the bitcode stdlib + * must only be used by CPUDevice, as it's targeted to the host CPU at Clover's + * compilation! */ + +/* + * Built-in functions generated by src/runtime/builtins.py + */ + +#include <clc.h> +#include <stdlib_impl.h> |