/* * Copyright (c) 2011, Denis Steckelmacher * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the copyright holder nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /** * \file cpu/builtins.cpp * \brief Native OpenCL C built-in functions * * All these built-ins are directly called by kernels. When the LLVM JIT * sees a function name it doesn't know, it calls \c getBuiltin() with this * name as parameter. This function then returns the address of an actual * function implementation, that finally gets called by the kernel when * it is run. */ #include "builtins.h" #include "kernel.h" #include "buffer.h" #include "../events.h" #include "../memobject.h" #include #include #include #include #include #include #include #include using namespace Coal; unsigned char *imageData(unsigned char *base, size_t x, size_t y, size_t z, size_t row_pitch, size_t slice_pitch, unsigned int bytes_per_pixel) { unsigned char *result = base; result += (z * slice_pitch) + (y * row_pitch) + (x * bytes_per_pixel); return result; } /* * TLS-related functions */ __thread Coal::CPUKernelWorkGroup *g_work_group; /*!< \brief \c Coal::CPUKernelWorkGroup currently running on this thread */ __thread void *work_items_data; /*!< \brief Space allocated for work-items stacks, see \ref barrier */ __thread size_t work_items_size; /*!< \brief Size of \c work_items_data, see \ref barrier */ void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current) { g_work_group = current; } void *getWorkItemsData(size_t &size) { size = work_items_size; return work_items_data; } void setWorkItemsData(void *ptr, size_t size) { work_items_data = ptr; work_items_size = size; } /* * Actual built-ins implementations */ cl_uint CPUKernelWorkGroup::getWorkDim() const { return p_work_dim; } size_t CPUKernelWorkGroup::getGlobalId(cl_uint dimindx) const { if (dimindx > p_work_dim) return 0; return p_global_id_start_offset[dimindx] + p_current_context->local_id[dimindx]; } size_t CPUKernelWorkGroup::getGlobalSize(cl_uint dimindx) const { if (dimindx >p_work_dim) return 1; return p_event->global_work_size(dimindx); } size_t CPUKernelWorkGroup::getLocalSize(cl_uint dimindx) const { if (dimindx > p_work_dim) return 1; return p_event->local_work_size(dimindx); } size_t CPUKernelWorkGroup::getLocalID(cl_uint dimindx) const { if (dimindx > p_work_dim) return 0; return p_current_context->local_id[dimindx]; } size_t CPUKernelWorkGroup::getNumGroups(cl_uint dimindx) const { if (dimindx > p_work_dim) return 1; return (p_event->global_work_size(dimindx) / p_event->local_work_size(dimindx)); } size_t CPUKernelWorkGroup::getGroupID(cl_uint dimindx) const { if (dimindx > p_work_dim) return 0; return p_index[dimindx]; } size_t CPUKernelWorkGroup::getGlobalOffset(cl_uint dimindx) const { if (dimindx > p_work_dim) return 0; return p_event->global_work_offset(dimindx); } void CPUKernelWorkGroup::barrier(unsigned int flags) { p_had_barrier = true; // Allocate or reuse TLS memory for the stacks (it isn't freed between // the work groups, and even the kernels, so if we need less space than // allocated, it's good) if (!p_contexts) { if (p_current_work_item != 0) { // Completely abnormal, it means that not every work-items // encounter the barrier std::cerr << "*** Not every work-items of " << p_kernel->function()->getName().str() << " calls barrier(); !" << std::endl; return; } // Allocate or reuse the stacks size_t contexts_size; p_contexts = getWorkItemsData(contexts_size); size_t needed_size = p_num_work_items * (p_stack_size + sizeof(Context)); if (!p_contexts || contexts_size < needed_size) { // We must allocate a new space if (p_contexts) munmap(p_contexts, contexts_size); p_contexts = mmap(0, needed_size, PROT_EXEC | PROT_READ | PROT_WRITE, /* People say a stack must be executable */ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); setWorkItemsData(p_contexts, contexts_size); } // Now that we have a real main context, initialize it p_current_context = getContextAddr(0); p_current_context->initialized = 1; std::memset(p_current_context->local_id, 0, p_work_dim * sizeof(size_t)); getcontext(&p_current_context->context); } // Take the next context p_current_work_item++; if (p_current_work_item == p_num_work_items) p_current_work_item = 0; Context *next = getContextAddr(p_current_work_item); Context *main = getContextAddr(0); // The context not created with makecontext // If the next context isn't initialized, initialize it. // Note: mmap zeroes the memory, so next->initialized == 0 if it isn't initialized if (next->initialized == 0) { next->initialized = 1; // local-id of next is the one of the current context, but incVec'ed std::memcpy(next->local_id, p_current_context->local_id, MAX_WORK_DIMS * sizeof(size_t)); incVec(p_work_dim, next->local_id, p_max_local_id); // Initialize the next context if (getcontext(&next->context) != 0) return; // Get its stack. It is located a next + sizeof(Context) char *stack = (char *)next; stack += sizeof(Context); next->context.uc_link = &main->context; next->context.uc_stack.ss_sp = stack; next->context.uc_stack.ss_size = p_stack_size; // Tell it to run the kernel function makecontext(&next->context, (void (*)())p_kernel_func_addr, 1, p_args); } // Switch to the next context ucontext_t *cur = &p_current_context->context; p_current_context = next; swapcontext(cur, &next->context); // When we return here, it means that all the other work items encountered // a barrier and that we returned to this one. We can continue. } void CPUKernelWorkGroup::builtinNotFound(const std::string &name) const { std::cout << "OpenCL: Non-existant builtin function " << name << std::endl; std::cout << " found in " << p_kernel->function()->getName().str() << '.' << std::endl; } /* * Built-in functions */ static size_t get_global_id(cl_uint dimindx) { return g_work_group->getGlobalId(dimindx); } static cl_uint get_work_dim() { return g_work_group->getWorkDim(); } static size_t get_global_size(uint dimindx) { return g_work_group->getGlobalSize(dimindx); } static size_t get_local_size(uint dimindx) { return g_work_group->getLocalSize(dimindx); } static size_t get_local_id(uint dimindx) { return g_work_group->getLocalID(dimindx); } static size_t get_num_groups(uint dimindx) { return g_work_group->getNumGroups(dimindx); } static size_t get_group_id(uint dimindx) { return g_work_group->getGroupID(dimindx); } static size_t get_global_offset(uint dimindx) { return g_work_group->getGlobalOffset(dimindx); } static void barrier(unsigned int flags) { g_work_group->barrier(flags); } // Images static int get_image_width(Image2D *image) { return image->width(); } static int get_image_height(Image2D *image) { return image->height(); } static int get_image_depth(Image3D *image) { if (image->type() != MemObject::Image3D) return 1; return image->depth(); } static int get_image_channel_data_type(Image2D *image) { return image->format().image_channel_data_type; } static int get_image_channel_order(Image2D *image) { return image->format().image_channel_order; } static void *image_data(Image2D *image, int x, int y, int z, int *order, int *type) { *order = image->format().image_channel_order; *type = image->format().image_channel_data_type; return g_work_group->getImageData(image, x, y, z); } static bool is_image_3d(Image3D *image) { return (image->type() == MemObject::Image3D ? 1 : 0); } static void write_imagef(Image2D *image, int x, int y, int z, float *color) { g_work_group->writeImage(image, x, y, z, color); } static void write_imagei(Image2D *image, int x, int y, int z, int32_t *color) { g_work_group->writeImage(image, x, y, z, color); } static void write_imageui(Image2D *image, int x, int y, int z, uint32_t *color) { g_work_group->writeImage(image, x, y, z, color); } static void read_imagefi(float *result, Image2D *image, int x, int y, int z, int32_t sampler) { g_work_group->readImage(result, image, x, y, z, sampler); } static void read_imageii(int32_t *result, Image2D *image, int x, int y, int z, int32_t sampler) { g_work_group->readImage(result, image, x, y, z, sampler); } static void read_imageuii(uint32_t *result, Image2D *image, int x, int y, int z, int32_t sampler) { g_work_group->readImage(result, image, x, y, z, sampler); } static void read_imageff(float *result, Image2D *image, float x, float y, float z, int32_t sampler) { g_work_group->readImage(result, image, x, y, z, sampler); } static void read_imageif(int32_t *result, Image2D *image, float x, float y, float z, int32_t sampler) { g_work_group->readImage(result, image, x, y, z, sampler); } static void read_imageuif(uint32_t *result, Image2D *image, float x, float y, float z, int32_t sampler) { g_work_group->readImage(result, image, x, y, z, sampler); } /* Dummy function to plug missing ARM ABI EH fxns: */ static void dummy_fxn(void) { } /* * Bridge between LLVM and us */ static void unimplemented_stub() { } void debug_ptr(void * arg) { char *s = (char *)arg; float f = *(float *)arg; double d = *(double *)arg; int i = *(int *)arg; } void *getBuiltin(const std::string &name) { if (name == "get_global_id") return (void *)&get_global_id; else if (name == "get_work_dim") return (void *)&get_work_dim; else if (name == "get_global_size") return (void *)&get_global_size; else if (name == "get_local_size") return (void *)&get_local_size; else if (name == "get_local_id") return (void *)&get_local_id; else if (name == "get_num_groups") return (void *)&get_num_groups; else if (name == "get_group_id") return (void *)&get_group_id; else if (name == "get_global_offset") return (void *)&get_global_offset; else if (name == "barrier") return (void *)&barrier; else if (name == "__cpu_get_image_width") return (void *)&get_image_width; else if (name == "__cpu_get_image_height") return (void *)&get_image_height; else if (name == "__cpu_get_image_depth") return (void *)&get_image_depth; else if (name == "__cpu_get_image_channel_data_type") return (void *)&get_image_channel_data_type; else if (name == "__cpu_get_image_channel_order") return (void *)&get_image_channel_order; else if (name == "__cpu_image_data") return (void *)&image_data; else if (name == "__cpu_is_image_3d") return (void *)&is_image_3d; else if (name == "__cpu_write_imagef") return (void *)&write_imagef; else if (name == "__cpu_write_imagei") return (void *)&write_imagei; else if (name == "__cpu_write_imageui") return (void *)&write_imageui; else if (name == "__cpu_read_imagefi") return (void *)&read_imagefi; else if (name == "__cpu_read_imageii") return (void *)&read_imageii; else if (name == "__cpu_read_imageuii") return (void *)&read_imageuii; else if (name == "__cpu_read_imageff") return (void *)&read_imageff; else if (name == "__cpu_read_imageif") return (void *)&read_imageif; else if (name == "__cpu_read_imageuif") return (void *)&read_imageuif; // Generic hook to set debugger bpt to inspect stack variable passed as (void *) else if (name == "debug_ptr") return(void *)&debug_ptr; else if (name == "__aeabi_unwind_cpp_pr0") return (void *)&dummy_fxn; else if (name == "__aeabi_unwind_cpp_pr1") return (void *)&dummy_fxn; else if (name == "__aeabi_unwind_cpp_pr2") return (void *)&dummy_fxn; // Math library disambiguation for OpenCL double functions of the same name. else if (name == "builtin_sincos") return (void *)&sincos; else if (name == "builtin_lgamma_r") return (void *)&lgamma_r; else if (name == "builtin_modf") return (void *)&modf; else if (name == "builtin_remquo") return (void *)&remquo; else if (name == "builtin_pow") return (void *)&pow; else if (name == "builtin_exp10f") return (void *)&exp10f; else if (name == "builtin_exp10") return (void *)&exp10; #if 0 // Other misc functions Khronos tests say are builtins, though not in the spec! else if (name == "memcpy") return (void *)&memcpy; #endif // Function not found g_work_group->builtinNotFound(name); return (void *)&unimplemented_stub; }