diff options
Diffstat (limited to 'final/runtime/test/tasking/kmp_task_reduction_nest.cpp')
-rw-r--r-- | final/runtime/test/tasking/kmp_task_reduction_nest.cpp | 377 |
1 files changed, 377 insertions, 0 deletions
diff --git a/final/runtime/test/tasking/kmp_task_reduction_nest.cpp b/final/runtime/test/tasking/kmp_task_reduction_nest.cpp new file mode 100644 index 0000000..019a9fe --- /dev/null +++ b/final/runtime/test/tasking/kmp_task_reduction_nest.cpp @@ -0,0 +1,377 @@ +// RUN: %libomp-cxx-compile-and-run +// RUN: %libomp-cxx-compile -DFLG=1 && %libomp-run +// REQUIRES: openmp-5.0 +// GCC-5 is needed for OpenMP 4.0 support (taskgroup) +// XFAIL: gcc-4 +#include <cstdio> +#include <cmath> +#include <cassert> +#include <omp.h> + +// Total number of loop iterations, should be multiple of T for this test +#define N 10000 + +// Flag to request lazy (1) or eager (0) allocation of reduction objects +#ifndef FLG +#define FLG 0 +#endif + +/* + // initial user's code that corresponds to pseudo code of the test + #pragma omp taskgroup task_reduction(+:i,j) task_reduction(*:x) + { + for( int l = 0; l < N; ++l ) { + #pragma omp task firstprivate(l) in_reduction(+:i) in_reduction(*:x) + { + i += l; + if( l%2 ) + x *= 1.0 / (l + 1); + else + x *= (l + 1); + } + } + + #pragma omp taskgroup task_reduction(-:i,k) task_reduction(+:y) + { + for( int l = 0; l < N; ++l ) { + #pragma omp task firstprivate(l) in_reduction(+:j,y) \ + in_reduction(*:x) in_reduction(-:k) + { + j += l; + k -= l; + y += (double)l; + if( l%2 ) + x *= 1.0 / (l + 1); + else + x *= (l + 1); + } + #pragma omp task firstprivate(l) in_reduction(+:y) in_reduction(-:i,k) + { + i -= l; + k -= l; + y += (double)l; + } + #pragma omp task firstprivate(l) in_reduction(+:j) in_reduction(*:x) + { + j += l; + if( l%2 ) + x *= 1.0 / (l + 1); + else + x *= (l + 1); + } + } + } // inner reduction + + for( int l = 0; l < N; ++l ) { + #pragma omp task firstprivate(l) in_reduction(+:j) + j += l; + } + } // outer reduction +*/ + +//------------------------------------------------ +// OpenMP runtime library routines +#ifdef __cplusplus +extern "C" { +#endif +extern void* __kmpc_task_reduction_get_th_data(int gtid, void* tg, void* item); +extern void* __kmpc_task_reduction_init(int gtid, int num, void* data); +extern int __kmpc_global_thread_num(void*); +#ifdef __cplusplus +} +#endif + +//------------------------------------------------ +// Compiler-generated code + +typedef struct _task_red_item { + void *shar; // shared reduction item + size_t size; // size of data item + void *f_init; // data initialization routine + void *f_fini; // data finalization routine + void *f_comb; // data combiner routine + unsigned flags; +} _task_red_item_t; + +// int:+ no need in init/fini callbacks, valid for subtraction +void __red_int_add_comb(void *lhs, void *rhs) // combiner +{ *(int*)lhs += *(int*)rhs; } + +// long long:+ no need in init/fini callbacks, valid for subtraction +void __red_llong_add_comb(void *lhs, void *rhs) // combiner +{ *(long long*)lhs += *(long long*)rhs; } + +// double:* no need in fini callback +void __red_dbl_mul_init(void *data) // initializer +{ *(double*)data = 1.0; } +void __red_dbl_mul_comb(void *lhs, void *rhs) // combiner +{ *(double*)lhs *= *(double*)rhs; } + +// double:+ no need in init/fini callbacks +void __red_dbl_add_comb(void *lhs, void *rhs) // combiner +{ *(double*)lhs += *(double*)rhs; } + +// ============================== + +void calc_serial(int *pi, long long *pj, double *px, long long *pk, double *py) +{ + for( int l = 0; l < N; ++l ) { + *pi += l; + if( l%2 ) + *px *= 1.0 / (l + 1); + else + *px *= (l + 1); + } + for( int l = 0; l < N; ++l ) { + *pj += l; + *pk -= l; + *py += (double)l; + if( l%2 ) + *px *= 1.0 / (l + 1); + else + *px *= (l + 1); + + *pi -= l; + *pk -= l; + *py += (double)l; + + *pj += l; + if( l%2 ) + *px *= 1.0 / (l + 1); + else + *px *= (l + 1); + } + for( int l = 0; l < N; ++l ) { + *pj += l; + } +} + +//------------------------------------------------ +// Test case +int main() +{ + int nthreads = omp_get_max_threads(); + int err = 0; + void** ptrs = (void**)malloc(nthreads*sizeof(void*)); + + // user's code ====================================== + // variables for serial calculations: + int is = 3; + long long js = -9999999; + double xs = 99999.0; + long long ks = 99999999; + double ys = -99999999.0; + // variables for parallel calculations: + int ip = 3; + long long jp = -9999999; + double xp = 99999.0; + long long kp = 99999999; + double yp = -99999999.0; + + calc_serial(&is, &js, &xs, &ks, &ys); + // ================================================== + for (int i = 0; i < nthreads; ++i) + ptrs[i] = NULL; + #pragma omp parallel + { + #pragma omp single nowait + { + // outer taskgroup reduces (i,j,x) + #pragma omp taskgroup // task_reduction(+:i,j) task_reduction(*:x) + { + _task_red_item_t red_data[3]; + red_data[0].shar = &ip; + red_data[0].size = sizeof(ip); + red_data[0].f_init = NULL; // RTL will zero thread-specific objects + red_data[0].f_fini = NULL; // no destructors needed + red_data[0].f_comb = (void*)&__red_int_add_comb; + red_data[0].flags = FLG; + red_data[1].shar = &jp; + red_data[1].size = sizeof(jp); + red_data[1].f_init = NULL; // RTL will zero thread-specific objects + red_data[1].f_fini = NULL; // no destructors needed + red_data[1].f_comb = (void*)&__red_llong_add_comb; + red_data[1].flags = FLG; + red_data[2].shar = &xp; + red_data[2].size = sizeof(xp); + red_data[2].f_init = (void*)&__red_dbl_mul_init; + red_data[2].f_fini = NULL; // no destructors needed + red_data[2].f_comb = (void*)&__red_dbl_mul_comb; + red_data[2].flags = FLG; + int gtid = __kmpc_global_thread_num(NULL); + void* tg1 = __kmpc_task_reduction_init(gtid, 3, red_data); + + for( int l = 0; l < N; l += 2 ) { + // 2 iterations per task to get correct x value; actually any even + // number of iters per task will work, otherwise x looses precision + #pragma omp task firstprivate(l) //in_reduction(+:i) in_reduction(*:x) + { + int gtid = __kmpc_global_thread_num(NULL); + int *p_ip = (int*)__kmpc_task_reduction_get_th_data(gtid, tg1, &ip); + double *p_xp = (double*)__kmpc_task_reduction_get_th_data( + gtid, tg1, &xp); + if (!ptrs[gtid]) ptrs[gtid] = p_xp; + + // user's pseudo-code ============================== + *p_ip += l; + *p_xp *= (l + 1); + + *p_ip += l + 1; + *p_xp *= 1.0 / (l + 2); + // ================================================== + } + } + // inner taskgroup reduces (i,k,y), i is same object as in outer one + #pragma omp taskgroup // task_reduction(-:i,k) task_reduction(+:y) + { + _task_red_item_t red_data[3]; + red_data[0].shar = &ip; + red_data[0].size = sizeof(ip); + red_data[0].f_init = NULL; // RTL will zero thread-specific objects + red_data[0].f_fini = NULL; // no destructors needed + red_data[0].f_comb = (void*)&__red_int_add_comb; + red_data[0].flags = FLG; + red_data[1].shar = &kp; + red_data[1].size = sizeof(kp); + red_data[1].f_init = NULL; // RTL will zero thread-specific objects + red_data[1].f_fini = NULL; // no destructors needed + red_data[1].f_comb = (void*)&__red_llong_add_comb; // same for + and - + red_data[1].flags = FLG; + red_data[2].shar = &yp; + red_data[2].size = sizeof(yp); + red_data[2].f_init = NULL; // RTL will zero thread-specific objects + red_data[2].f_fini = NULL; // no destructors needed + red_data[2].f_comb = (void*)&__red_dbl_add_comb; + red_data[2].flags = FLG; + int gtid = __kmpc_global_thread_num(NULL); + void* tg2 = __kmpc_task_reduction_init(gtid, 3, red_data); + + for( int l = 0; l < N; l += 2 ) { + #pragma omp task firstprivate(l) + // in_reduction(+:j,y) in_reduction(*:x) in_reduction(-:k) + { + int gtid = __kmpc_global_thread_num(NULL); + long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data( + gtid, tg1, &jp); + long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data( + gtid, tg2, &kp); + double *p_xp = (double*)__kmpc_task_reduction_get_th_data( + gtid, tg1, &xp); + double *p_yp = (double*)__kmpc_task_reduction_get_th_data( + gtid, tg2, &yp); + // user's pseudo-code ============================== + *p_jp += l; + *p_kp -= l; + *p_yp += (double)l; + *p_xp *= (l + 1); + + *p_jp += l + 1; + *p_kp -= l + 1; + *p_yp += (double)(l + 1); + *p_xp *= 1.0 / (l + 2); + // ================================================= +{ + // the following code is here just to check __kmpc_task_reduction_get_th_data: + int tid = omp_get_thread_num(); + void *addr1; + void *addr2; + addr1 = __kmpc_task_reduction_get_th_data(gtid, tg1, &xp); // from shared + addr2 = __kmpc_task_reduction_get_th_data(gtid, tg1, addr1); // from private + if (addr1 != addr2) { + #pragma omp atomic + ++err; + printf("Wrong thread-specific addresses %d s:%p p:%p\n", tid, addr1, addr2); + } + // from neighbour w/o taskgroup (should start lookup from current tg2) + if (tid > 0) { + if (ptrs[tid-1]) { + addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[tid-1]); + if (addr1 != addr2) { + #pragma omp atomic + ++err; + printf("Wrong thread-specific addresses %d s:%p n:%p\n", + tid, addr1, addr2); + } + } + } else { + if (ptrs[nthreads-1]) { + addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[nthreads-1]); + if (addr1 != addr2) { + #pragma omp atomic + ++err; + printf("Wrong thread-specific addresses %d s:%p n:%p\n", + tid, addr1, addr2); + } + } + } + // ---------------------------------------------- +} + } + #pragma omp task firstprivate(l) + // in_reduction(+:y) in_reduction(-:i,k) + { + int gtid = __kmpc_global_thread_num(NULL); + int *p_ip = (int*)__kmpc_task_reduction_get_th_data( + gtid, tg2, &ip); + long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data( + gtid, tg2, &kp); + double *p_yp = (double*)__kmpc_task_reduction_get_th_data( + gtid, tg2, &yp); + + // user's pseudo-code ============================== + *p_ip -= l; + *p_kp -= l; + *p_yp += (double)l; + + *p_ip -= l + 1; + *p_kp -= l + 1; + *p_yp += (double)(l + 1); + // ================================================= + } + #pragma omp task firstprivate(l) + // in_reduction(+:j) in_reduction(*:x) + { + int gtid = __kmpc_global_thread_num(NULL); + long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data( + gtid, tg1, &jp); + double *p_xp = (double*)__kmpc_task_reduction_get_th_data( + gtid, tg1, &xp); + // user's pseudo-code ============================== + *p_jp += l; + *p_xp *= (l + 1); + + *p_jp += l + 1; + *p_xp *= 1.0 / (l + 2); + // ================================================= + } + } + } // inner reduction + + for( int l = 0; l < N; l += 2 ) { + #pragma omp task firstprivate(l) // in_reduction(+:j) + { + int gtid = __kmpc_global_thread_num(NULL); + long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data( + gtid, tg1, &jp); + // user's pseudo-code ============================== + *p_jp += l; + *p_jp += l + 1; + // ================================================= + } + } + } // outer reduction + } // end single + } // end parallel + // check results +#if _DEBUG + printf("reduction flags = %u\n", FLG); +#endif + if (ip == is && jp == js && ks == kp && + fabs(xp - xs) < 0.01 && fabs(yp - ys) < 0.01) + printf("passed\n"); + else + printf("failed,\n ser:(%d %lld %f %lld %f)\n par:(%d %lld %f %lld %f)\n", + is, js, xs, ks, ys, + ip, jp, xp, kp, yp); + return 0; +} |