aboutsummaryrefslogtreecommitdiff
path: root/final/runtime/test/tasking/kmp_task_reduction_nest.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'final/runtime/test/tasking/kmp_task_reduction_nest.cpp')
-rw-r--r--final/runtime/test/tasking/kmp_task_reduction_nest.cpp377
1 files changed, 377 insertions, 0 deletions
diff --git a/final/runtime/test/tasking/kmp_task_reduction_nest.cpp b/final/runtime/test/tasking/kmp_task_reduction_nest.cpp
new file mode 100644
index 0000000..019a9fe
--- /dev/null
+++ b/final/runtime/test/tasking/kmp_task_reduction_nest.cpp
@@ -0,0 +1,377 @@
+// RUN: %libomp-cxx-compile-and-run
+// RUN: %libomp-cxx-compile -DFLG=1 && %libomp-run
+// REQUIRES: openmp-5.0
+// GCC-5 is needed for OpenMP 4.0 support (taskgroup)
+// XFAIL: gcc-4
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+#include <cassert>
+#include <omp.h>
+
+// Total number of loop iterations; should be even for this test, because each
+// task in main() processes two consecutive iterations (l and l + 1)
+#define N 10000
+
+// Flag to request lazy (1) or eager (0) allocation of reduction objects
+#ifndef FLG
+#define FLG 0
+#endif
+
+/*
+ // initial user's code that corresponds to pseudo code of the test
+ #pragma omp taskgroup task_reduction(+:i,j) task_reduction(*:x)
+ {
+ for( int l = 0; l < N; ++l ) {
+ #pragma omp task firstprivate(l) in_reduction(+:i) in_reduction(*:x)
+ {
+ i += l;
+ if( l%2 )
+ x *= 1.0 / (l + 1);
+ else
+ x *= (l + 1);
+ }
+ }
+
+ #pragma omp taskgroup task_reduction(-:i,k) task_reduction(+:y)
+ {
+ for( int l = 0; l < N; ++l ) {
+ #pragma omp task firstprivate(l) in_reduction(+:j,y) \
+ in_reduction(*:x) in_reduction(-:k)
+ {
+ j += l;
+ k -= l;
+ y += (double)l;
+ if( l%2 )
+ x *= 1.0 / (l + 1);
+ else
+ x *= (l + 1);
+ }
+ #pragma omp task firstprivate(l) in_reduction(+:y) in_reduction(-:i,k)
+ {
+ i -= l;
+ k -= l;
+ y += (double)l;
+ }
+ #pragma omp task firstprivate(l) in_reduction(+:j) in_reduction(*:x)
+ {
+ j += l;
+ if( l%2 )
+ x *= 1.0 / (l + 1);
+ else
+ x *= (l + 1);
+ }
+ }
+ } // inner reduction
+
+ for( int l = 0; l < N; ++l ) {
+ #pragma omp task firstprivate(l) in_reduction(+:j)
+ j += l;
+ }
+ } // outer reduction
+*/
+
+//------------------------------------------------
+// OpenMP runtime library routines
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern void* __kmpc_task_reduction_get_th_data(int gtid, void* tg, void* item); // used below to obtain the calling thread's copy of a reduction item; item is passed as a shared or a private address, tg may be NULL
+extern void* __kmpc_task_reduction_init(int gtid, int num, void* data); // called below with an array of num _task_red_item_t descriptors; the result is later passed as tg
+extern int __kmpc_global_thread_num(void*); // called below with NULL to obtain the gtid argument of the routines above
+#ifdef __cplusplus
+}
+#endif
+
+//------------------------------------------------
+// Compiler-generated code
+
+typedef struct _task_red_item { // descriptor of one reduction item; main() passes arrays of these to __kmpc_task_reduction_init
+ void *shar; // address of the shared (original) reduction variable
+ size_t size; // size of the data item in bytes (filled in with sizeof below)
+ void *f_init; // data initialization routine; NULL where the test relies on the RTL zeroing thread-specific objects
+ void *f_fini; // data finalization routine; always NULL in this test (no destructors needed)
+ void *f_comb; // data combiner routine, called as f_comb(lhs, rhs)
+ unsigned flags; // set to FLG below: lazy (1) or eager (0) allocation of reduction objects
+} _task_red_item_t;
+
+// int:+ combiner; init/fini callbacks are not needed. It is also used for
+// subtraction, since the partial results are still combined by addition.
+// Adds the int at rhs into the int at lhs; rhs is left unchanged.
+void __red_int_add_comb(void *lhs, void *rhs)
+{
+  int *acc = (int *)lhs;
+  const int *part = (const int *)rhs;
+  *acc = *acc + *part;
+}
+
+// long long:+ combiner; init/fini callbacks are not needed. It is also used
+// for subtraction, since the partial results are still combined by addition.
+// Adds the long long at rhs into the long long at lhs; rhs is left unchanged.
+void __red_llong_add_comb(void *lhs, void *rhs)
+{
+  long long *acc = (long long *)lhs;
+  const long long *part = (const long long *)rhs;
+  *acc = *acc + *part;
+}
+
+// double:* initializer (no fini callback is needed for this reduction).
+// Stores the multiplicative identity 1.0 into the double at data.
+void __red_dbl_mul_init(void *data)
+{
+  double *item = (double *)data;
+  *item = 1.0;
+}
+// double:* combiner: multiplies the double at lhs by the double at rhs,
+// storing the product in lhs; rhs is left unchanged.
+void __red_dbl_mul_comb(void *lhs, void *rhs)
+{
+  double *acc = (double *)lhs;
+  const double *part = (const double *)rhs;
+  *acc = *acc * *part;
+}
+
+// double:+ combiner; init/fini callbacks are not needed.
+// Adds the double at rhs into the double at lhs; rhs is left unchanged.
+void __red_dbl_add_comb(void *lhs, void *rhs)
+{
+  double *acc = (double *)lhs;
+  const double *part = (const double *)rhs;
+  *acc = *acc + *part;
+}
+
+// ==============================
+
+// Serial reference: applies on one thread the same sequence of updates that
+// the pseudo code of the test describes, accumulating directly into the
+// variables pointed to by pi, pj, px, pk and py. main() compares the results
+// of the task reductions against the values produced here.
+void calc_serial(int *pi, long long *pj, double *px, long long *pk, double *py)
+{
+  int l;
+
+  // phase 1: tasks created directly in the outer taskgroup (i and x)
+  l = 0;
+  while (l < N) {
+    *pi += l;
+    // odd iterations divide by (l + 1), even iterations multiply by (l + 1)
+    *px *= (l % 2) ? 1.0 / (l + 1) : (double)(l + 1);
+    ++l;
+  }
+
+  // phase 2: the three tasks created per iteration in the inner taskgroup
+  l = 0;
+  while (l < N) {
+    const double factor = (l % 2) ? 1.0 / (l + 1) : (double)(l + 1);
+
+    // first task: j, k, y and x
+    *pj += l;
+    *pk -= l;
+    *py += (double)l;
+    *px *= factor;
+
+    // second task: i, k and y
+    *pi -= l;
+    *pk -= l;
+    *py += (double)l;
+
+    // third task: j and x
+    *pj += l;
+    *px *= factor;
+
+    ++l;
+  }
+
+  // phase 3: trailing tasks of the outer taskgroup (j only)
+  l = 0;
+  while (l < N) {
+    *pj += l;
+    ++l;
+  }
+}
+
+//------------------------------------------------
+// Test case
+// Runs the nested task-reduction scenario through hand-written calls to the
+// runtime (mimicking compiler-generated code), compares the reduced values
+// with the serial reference from calc_serial(), and also checks the addresses
+// returned by __kmpc_task_reduction_get_th_data.
+// Returns 0 when every check passes and 1 otherwise, so that a failure is
+// visible to the test harness through the exit status.
+int main()
+{
+  int nthreads = omp_get_max_threads();
+  int err = 0; // number of failed thread-specific address checks
+  // ptrs[t] remembers the thread-specific copy of xp first seen by team
+  // thread t; used below to test lookups by a neighbour's private address
+  void** ptrs = (void**)malloc(nthreads*sizeof(void*));
+  if (ptrs == NULL) {
+    printf("failed, cannot allocate memory\n");
+    return 1;
+  }
+
+  // user's code ======================================
+  // variables for serial calculations:
+  int is = 3;
+  long long js = -9999999;
+  double xs = 99999.0;
+  long long ks = 99999999;
+  double ys = -99999999.0;
+  // variables for parallel calculations:
+  int ip = 3;
+  long long jp = -9999999;
+  double xp = 99999.0;
+  long long kp = 99999999;
+  double yp = -99999999.0;
+
+  calc_serial(&is, &js, &xs, &ks, &ys);
+  // ==================================================
+  for (int i = 0; i < nthreads; ++i)
+    ptrs[i] = NULL;
+  #pragma omp parallel
+  {
+    #pragma omp single nowait
+    {
+      // outer taskgroup reduces (i,j,x)
+      #pragma omp taskgroup // task_reduction(+:i,j) task_reduction(*:x)
+      {
+        _task_red_item_t red_data[3];
+        red_data[0].shar = &ip;
+        red_data[0].size = sizeof(ip);
+        red_data[0].f_init = NULL; // RTL will zero thread-specific objects
+        red_data[0].f_fini = NULL; // no destructors needed
+        red_data[0].f_comb = (void*)&__red_int_add_comb;
+        red_data[0].flags = FLG;
+        red_data[1].shar = &jp;
+        red_data[1].size = sizeof(jp);
+        red_data[1].f_init = NULL; // RTL will zero thread-specific objects
+        red_data[1].f_fini = NULL; // no destructors needed
+        red_data[1].f_comb = (void*)&__red_llong_add_comb;
+        red_data[1].flags = FLG;
+        red_data[2].shar = &xp;
+        red_data[2].size = sizeof(xp);
+        red_data[2].f_init = (void*)&__red_dbl_mul_init;
+        red_data[2].f_fini = NULL; // no destructors needed
+        red_data[2].f_comb = (void*)&__red_dbl_mul_comb;
+        red_data[2].flags = FLG;
+        int gtid = __kmpc_global_thread_num(NULL);
+        void* tg1 = __kmpc_task_reduction_init(gtid, 3, red_data);
+
+        for( int l = 0; l < N; l += 2 ) {
+          // 2 iterations per task to get correct x value; actually any even
+          // number of iters per task will work, otherwise x loses precision
+          #pragma omp task firstprivate(l) //in_reduction(+:i) in_reduction(*:x)
+          {
+            int gtid = __kmpc_global_thread_num(NULL);
+            int *p_ip = (int*)__kmpc_task_reduction_get_th_data(gtid, tg1, &ip);
+            double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
+                               gtid, tg1, &xp);
+            // index by team thread number: ptrs has omp_get_max_threads()
+            // entries and is read below by omp_get_thread_num()
+            int tid = omp_get_thread_num();
+            if (!ptrs[tid]) ptrs[tid] = p_xp;
+
+            // user's pseudo-code ==============================
+            *p_ip += l;
+            *p_xp *= (l + 1);
+
+            *p_ip += l + 1;
+            *p_xp *= 1.0 / (l + 2);
+            // ==================================================
+          }
+        }
+        // inner taskgroup reduces (i,k,y), i is same object as in outer one
+        #pragma omp taskgroup // task_reduction(-:i,k) task_reduction(+:y)
+        {
+          _task_red_item_t red_data[3];
+          red_data[0].shar = &ip;
+          red_data[0].size = sizeof(ip);
+          red_data[0].f_init = NULL; // RTL will zero thread-specific objects
+          red_data[0].f_fini = NULL; // no destructors needed
+          red_data[0].f_comb = (void*)&__red_int_add_comb;
+          red_data[0].flags = FLG;
+          red_data[1].shar = &kp;
+          red_data[1].size = sizeof(kp);
+          red_data[1].f_init = NULL; // RTL will zero thread-specific objects
+          red_data[1].f_fini = NULL; // no destructors needed
+          red_data[1].f_comb = (void*)&__red_llong_add_comb; // same for + and -
+          red_data[1].flags = FLG;
+          red_data[2].shar = &yp;
+          red_data[2].size = sizeof(yp);
+          red_data[2].f_init = NULL; // RTL will zero thread-specific objects
+          red_data[2].f_fini = NULL; // no destructors needed
+          red_data[2].f_comb = (void*)&__red_dbl_add_comb;
+          red_data[2].flags = FLG;
+          int gtid = __kmpc_global_thread_num(NULL);
+          void* tg2 = __kmpc_task_reduction_init(gtid, 3, red_data);
+
+          for( int l = 0; l < N; l += 2 ) {
+            #pragma omp task firstprivate(l)
+            // in_reduction(+:j,y) in_reduction(*:x) in_reduction(-:k)
+            {
+              int gtid = __kmpc_global_thread_num(NULL);
+              long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
+                                    gtid, tg1, &jp);
+              long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
+                                    gtid, tg2, &kp);
+              double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
+                                 gtid, tg1, &xp);
+              double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
+                                 gtid, tg2, &yp);
+              // user's pseudo-code ==============================
+              *p_jp += l;
+              *p_kp -= l;
+              *p_yp += (double)l;
+              *p_xp *= (l + 1);
+
+              *p_jp += l + 1;
+              *p_kp -= l + 1;
+              *p_yp += (double)(l + 1);
+              *p_xp *= 1.0 / (l + 2);
+              // =================================================
+              {
+                // the following code is here just to check
+                // __kmpc_task_reduction_get_th_data:
+                int tid = omp_get_thread_num();
+                void *addr1;
+                void *addr2;
+                addr1 = __kmpc_task_reduction_get_th_data(gtid, tg1, &xp); // from shared
+                addr2 = __kmpc_task_reduction_get_th_data(gtid, tg1, addr1); // from private
+                if (addr1 != addr2) {
+                  #pragma omp atomic
+                  ++err;
+                  printf("Wrong thread-specific addresses %d s:%p p:%p\n", tid, addr1, addr2);
+                }
+                // from neighbour w/o taskgroup (should start lookup from current tg2)
+                if (tid > 0) {
+                  if (ptrs[tid-1]) {
+                    addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[tid-1]);
+                    if (addr1 != addr2) {
+                      #pragma omp atomic
+                      ++err;
+                      printf("Wrong thread-specific addresses %d s:%p n:%p\n",
+                             tid, addr1, addr2);
+                    }
+                  }
+                } else {
+                  if (ptrs[nthreads-1]) {
+                    addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[nthreads-1]);
+                    if (addr1 != addr2) {
+                      #pragma omp atomic
+                      ++err;
+                      printf("Wrong thread-specific addresses %d s:%p n:%p\n",
+                             tid, addr1, addr2);
+                    }
+                  }
+                }
+                // ----------------------------------------------
+              }
+            }
+            #pragma omp task firstprivate(l)
+            // in_reduction(+:y) in_reduction(-:i,k)
+            {
+              int gtid = __kmpc_global_thread_num(NULL);
+              int *p_ip = (int*)__kmpc_task_reduction_get_th_data(
+                              gtid, tg2, &ip);
+              long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
+                                    gtid, tg2, &kp);
+              double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
+                                 gtid, tg2, &yp);
+
+              // user's pseudo-code ==============================
+              *p_ip -= l;
+              *p_kp -= l;
+              *p_yp += (double)l;
+
+              *p_ip -= l + 1;
+              *p_kp -= l + 1;
+              *p_yp += (double)(l + 1);
+              // =================================================
+            }
+            #pragma omp task firstprivate(l)
+            // in_reduction(+:j) in_reduction(*:x)
+            {
+              int gtid = __kmpc_global_thread_num(NULL);
+              long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
+                                    gtid, tg1, &jp);
+              double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
+                                 gtid, tg1, &xp);
+              // user's pseudo-code ==============================
+              *p_jp += l;
+              *p_xp *= (l + 1);
+
+              *p_jp += l + 1;
+              *p_xp *= 1.0 / (l + 2);
+              // =================================================
+            }
+          }
+        } // inner reduction
+
+        for( int l = 0; l < N; l += 2 ) {
+          #pragma omp task firstprivate(l) // in_reduction(+:j)
+          {
+            int gtid = __kmpc_global_thread_num(NULL);
+            long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
+                                  gtid, tg1, &jp);
+            // user's pseudo-code ==============================
+            *p_jp += l;
+            *p_jp += l + 1;
+            // =================================================
+          }
+        }
+      } // outer reduction
+    } // end single
+  } // end parallel
+  // check results
+#if _DEBUG
+  printf("reduction flags = %u\n", (unsigned)FLG);
+#endif
+  // the test passes only if the address checks found no errors and the
+  // reduced values match the serial reference
+  const bool passed = err == 0 && ip == is && jp == js && ks == kp &&
+                      fabs(xp - xs) < 0.01 && fabs(yp - ys) < 0.01;
+  if (passed)
+    printf("passed\n");
+  else
+    printf("failed,\n ser:(%d %lld %f %lld %f)\n par:(%d %lld %f %lld %f)\n",
+      is, js, xs, ks, ys,
+      ip, jp, xp, kp, yp);
+  free(ptrs);
+  // report failure through the exit status as well as through the output
+  return passed ? 0 : 1;
+}