Diffstat (limited to 'final/runtime/test/tasking')
-rw-r--r--  final/runtime/test/tasking/bug_36720.c                  |  36
-rw-r--r--  final/runtime/test/tasking/bug_nested_proxy_task.c      | 131
-rw-r--r--  final/runtime/test/tasking/bug_proxy_task_dep_waiting.c | 134
-rw-r--r--  final/runtime/test/tasking/bug_serial_taskgroup.c       |  16
-rw-r--r--  final/runtime/test/tasking/kmp_task_reduction_nest.cpp  | 376
-rw-r--r--  final/runtime/test/tasking/kmp_taskloop.c               | 159
-rw-r--r--  final/runtime/test/tasking/nested_parallel_tasking.c    |  32
-rw-r--r--  final/runtime/test/tasking/nested_task_creation.c       |  35
-rw-r--r--  final/runtime/test/tasking/omp_task.c                   |  52
-rw-r--r--  final/runtime/test/tasking/omp_task_final.c             |  65
-rw-r--r--  final/runtime/test/tasking/omp_task_firstprivate.c      |  51
-rw-r--r--  final/runtime/test/tasking/omp_task_if.c                |  43
-rw-r--r--  final/runtime/test/tasking/omp_task_imp_firstprivate.c  |  47
-rw-r--r--  final/runtime/test/tasking/omp_task_priority.c          |  22
-rw-r--r--  final/runtime/test/tasking/omp_task_private.c           |  53
-rw-r--r--  final/runtime/test/tasking/omp_task_shared.c            |  41
-rw-r--r--  final/runtime/test/tasking/omp_taskloop_grainsize.c     | 113
-rw-r--r--  final/runtime/test/tasking/omp_taskloop_num_tasks.c     |  71
-rw-r--r--  final/runtime/test/tasking/omp_taskwait.c               |  74
-rw-r--r--  final/runtime/test/tasking/omp_taskyield.c              |  58
20 files changed, 1609 insertions, 0 deletions
diff --git a/final/runtime/test/tasking/bug_36720.c b/final/runtime/test/tasking/bug_36720.c
new file mode 100644
index 0000000..684d675
--- /dev/null
+++ b/final/runtime/test/tasking/bug_36720.c
@@ -0,0 +1,36 @@
+// RUN: %libomp-compile-and-run
+
+/*
+Bugzilla: https://bugs.llvm.org/show_bug.cgi?id=36720
+
+Assertion failure at kmp_runtime.cpp(1715): nthreads > 0.
+OMP: Error #13: Assertion failure at kmp_runtime.cpp(1715).
+
+The assertion fails even with OMP_NUM_THREADS=1. If the second task is removed,
+everything runs to completion. If the "omp parallel for" directives are removed
+from inside the tasks, once again everything runs fine.
+*/
+
+#define N 1024
+
+int main() {
+ #pragma omp task
+ {
+ int i;
+ #pragma omp parallel for
+ for (i = 0; i < N; i++)
+ (void)0;
+ }
+
+ #pragma omp task
+ {
+ int i;
+ #pragma omp parallel for
+ for (i = 0; i < N; ++i)
+ (void)0;
+ }
+
+ #pragma omp taskwait
+
+ return 0;
+}
diff --git a/final/runtime/test/tasking/bug_nested_proxy_task.c b/final/runtime/test/tasking/bug_nested_proxy_task.c
new file mode 100644
index 0000000..6c00822
--- /dev/null
+++ b/final/runtime/test/tasking/bug_nested_proxy_task.c
@@ -0,0 +1,131 @@
+// RUN: %libomp-compile -lpthread && %libomp-run
+// The runtime currently does not get dependency information from GCC.
+// UNSUPPORTED: gcc
+
+#include <stdio.h>
+#include <omp.h>
+#include <pthread.h>
+#include "omp_my_sleep.h"
+
+/*
+ With task dependencies one can generate proxy tasks from an explicit task
+ that is being executed by a serial task team. The OpenMP runtime library did
+ not expect that and tried to free the explicit task that is the parent of a
+ proxy task still working in the background. That parent therefore still has
+ incomplete children, which triggers a debugging assertion.
+*/
+
+// Compiler-generated code (emulation)
+typedef long kmp_intptr_t;
+typedef int kmp_int32;
+
+typedef char bool;
+
+typedef struct ident {
+ kmp_int32 reserved_1; /**< might be used in Fortran; see above */
+ kmp_int32 flags; /**< also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC identifies this union member */
+ kmp_int32 reserved_2; /**< not really used in Fortran any more; see above */
+#if USE_ITT_BUILD
+ /* but currently used for storing region-specific ITT */
+ /* contextual information. */
+#endif /* USE_ITT_BUILD */
+ kmp_int32 reserved_3; /**< source[4] in Fortran, do not use for C++ */
+ char const *psource; /**< String describing the source location.
+ The string is composed of semi-colon separated fields which describe the source file,
+ the function and a pair of line numbers that delimit the construct.
+ */
+} ident_t;
+
+typedef struct kmp_depend_info {
+ kmp_intptr_t base_addr;
+ size_t len;
+ struct {
+ bool in:1;
+ bool out:1;
+ } flags;
+} kmp_depend_info_t;
+
+struct kmp_task;
+typedef kmp_int32 (* kmp_routine_entry_t)( kmp_int32, struct kmp_task * );
+
+typedef struct kmp_task { /* GEH: Shouldn't this be aligned somehow? */
+ void * shareds; /**< pointer to block of pointers to shared vars */
+ kmp_routine_entry_t routine; /**< pointer to routine to call for executing task */
+ kmp_int32 part_id; /**< part id for the task */
+} kmp_task_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+kmp_int32 __kmpc_global_thread_num ( ident_t * );
+kmp_task_t*
+__kmpc_omp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
+ size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+ kmp_routine_entry_t task_entry );
+void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask );
+kmp_int32 __kmpc_omp_task_with_deps ( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task,
+ kmp_int32 ndeps, kmp_depend_info_t *dep_list,
+ kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list );
+kmp_int32
+__kmpc_omp_task( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task );
+#ifdef __cplusplus
+}
+#endif
+
+void *target(void *task)
+{
+ my_sleep( 0.1 );
+ __kmpc_proxy_task_completed_ooo((kmp_task_t*) task);
+ return NULL;
+}
+
+pthread_t target_thread;
+
+// User's code
+int task_entry(kmp_int32 gtid, kmp_task_t *task)
+{
+ pthread_create(&target_thread, NULL, &target, task);
+ return 0;
+}
+
+int main()
+{
+ int dep;
+
+#pragma omp taskgroup
+{
+/*
+ * Corresponds to:
+ #pragma omp target nowait depend(out: dep)
+ {
+ my_sleep( 0.1 );
+ }
+*/
+ kmp_depend_info_t dep_info;
+ dep_info.base_addr = (long) &dep;
+ dep_info.len = sizeof(int);
+ // out = inout per spec and runtime expects this
+ dep_info.flags.in = 1;
+ dep_info.flags.out = 1;
+
+ kmp_int32 gtid = __kmpc_global_thread_num(NULL);
+ kmp_task_t *proxy_task = __kmpc_omp_task_alloc(NULL,gtid,17,sizeof(kmp_task_t),0,&task_entry);
+ __kmpc_omp_task_with_deps(NULL,gtid,proxy_task,1,&dep_info,0,NULL);
+
+ #pragma omp task depend(in: dep)
+ {
+/*
+ * Corresponds to:
+ #pragma omp target nowait
+ {
+ my_sleep( 0.1 );
+ }
+*/
+ kmp_task_t *nested_proxy_task = __kmpc_omp_task_alloc(NULL,gtid,17,sizeof(kmp_task_t),0,&task_entry);
+ __kmpc_omp_task(NULL,gtid,nested_proxy_task);
+ }
+}
+
+ // only check that it didn't crash
+ return 0;
+}
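At the OpenMP 5.0 source level, the proxy-task pattern emulated above roughly corresponds to a task with a detach clause whose completion event is fulfilled from a plain pthread. A minimal sketch of that correspondence, assuming a compiler and runtime with detach support (illustrative only, not one of the files added by this diff):

    #include <omp.h>
    #include <pthread.h>
    #include <unistd.h>

    static omp_event_handle_t ev;   // filled in by the detach clause below

    static void *fulfill_later(void *arg) {
      (void)arg;
      usleep(100000);               // stand-in for asynchronous "target" work
      omp_fulfill_event(ev);        // completes the detached (proxy-like) task
      return NULL;
    }

    int main(void) {
      int dep = 0;
      pthread_t helper;

      #pragma omp taskgroup
      {
        // Detached task: stays incomplete until omp_fulfill_event(ev) is called.
        #pragma omp task detach(ev) depend(out: dep)
        pthread_create(&helper, NULL, fulfill_later, NULL);

        // Dependent sibling task, released once the event above is fulfilled.
        #pragma omp task depend(in: dep)
        dep = 1;
      }
      pthread_join(helper, NULL);
      return 0;
    }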
diff --git a/final/runtime/test/tasking/bug_proxy_task_dep_waiting.c b/final/runtime/test/tasking/bug_proxy_task_dep_waiting.c
new file mode 100644
index 0000000..e6dd895
--- /dev/null
+++ b/final/runtime/test/tasking/bug_proxy_task_dep_waiting.c
@@ -0,0 +1,134 @@
+// RUN: %libomp-compile -lpthread && %libomp-run
+// The runtime currently does not get dependency information from GCC.
+// UNSUPPORTED: gcc
+
+#include <stdio.h>
+#include <omp.h>
+#include <pthread.h>
+#include "omp_my_sleep.h"
+
+/*
+ An explicit task can have a dependency on a target task. If the dependency
+ is not immediately satisfied, the runtime should not wait for it but resume
+ execution.
+*/
+
+// Compiler-generated code (emulation)
+typedef long kmp_intptr_t;
+typedef int kmp_int32;
+
+typedef char bool;
+
+typedef struct ident {
+ kmp_int32 reserved_1; /**< might be used in Fortran; see above */
+ kmp_int32 flags; /**< also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC identifies this union member */
+ kmp_int32 reserved_2; /**< not really used in Fortran any more; see above */
+#if USE_ITT_BUILD
+ /* but currently used for storing region-specific ITT */
+ /* contextual information. */
+#endif /* USE_ITT_BUILD */
+ kmp_int32 reserved_3; /**< source[4] in Fortran, do not use for C++ */
+ char const *psource; /**< String describing the source location.
+ The string is composed of semi-colon separated fields which describe the source file,
+ the function and a pair of line numbers that delimit the construct.
+ */
+} ident_t;
+
+typedef struct kmp_depend_info {
+ kmp_intptr_t base_addr;
+ size_t len;
+ struct {
+ bool in:1;
+ bool out:1;
+ } flags;
+} kmp_depend_info_t;
+
+struct kmp_task;
+typedef kmp_int32 (* kmp_routine_entry_t)( kmp_int32, struct kmp_task * );
+
+typedef struct kmp_task { /* GEH: Shouldn't this be aligned somehow? */
+ void * shareds; /**< pointer to block of pointers to shared vars */
+ kmp_routine_entry_t routine; /**< pointer to routine to call for executing task */
+ kmp_int32 part_id; /**< part id for the task */
+} kmp_task_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+kmp_int32 __kmpc_global_thread_num ( ident_t * );
+kmp_task_t*
+__kmpc_omp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
+ size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+ kmp_routine_entry_t task_entry );
+void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask );
+kmp_int32 __kmpc_omp_task_with_deps ( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task,
+ kmp_int32 ndeps, kmp_depend_info_t *dep_list,
+ kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list );
+kmp_int32
+__kmpc_omp_task( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task );
+#ifdef __cplusplus
+}
+#endif
+
+void *target(void *task)
+{
+ my_sleep( 0.1 );
+ __kmpc_proxy_task_completed_ooo((kmp_task_t*) task);
+ return NULL;
+}
+
+pthread_t target_thread;
+
+// User's code
+int task_entry(kmp_int32 gtid, kmp_task_t *task)
+{
+ pthread_create(&target_thread, NULL, &target, task);
+ return 0;
+}
+
+int main()
+{
+ int dep;
+
+/*
+ * Corresponds to:
+ #pragma omp target nowait depend(out: dep)
+ {
+ my_sleep( 0.1 );
+ }
+*/
+ kmp_depend_info_t dep_info;
+ dep_info.base_addr = (long) &dep;
+ dep_info.len = sizeof(int);
+ // out = inout per spec and runtime expects this
+ dep_info.flags.in = 1;
+ dep_info.flags.out = 1;
+
+ kmp_int32 gtid = __kmpc_global_thread_num(NULL);
+ kmp_task_t *proxy_task = __kmpc_omp_task_alloc(NULL,gtid,17,sizeof(kmp_task_t),0,&task_entry);
+ __kmpc_omp_task_with_deps(NULL,gtid,proxy_task,1,&dep_info,0,NULL);
+
+ int first_task_finished = 0;
+ #pragma omp task shared(first_task_finished) depend(inout: dep)
+ {
+ first_task_finished = 1;
+ }
+
+ int second_task_finished = 0;
+ #pragma omp task shared(second_task_finished) depend(in: dep)
+ {
+ second_task_finished = 1;
+ }
+
+ // check that execution has been resumed and the runtime has not waited
+ // for the dependencies to be satisfied.
+ int error = (first_task_finished == 1);
+ error += (second_task_finished == 1);
+
+ #pragma omp taskwait
+
+ // by now all tasks should have finished
+ error += (first_task_finished != 1);
+ error += (second_task_finished != 1);
+
+ return error;
+}
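Both proxy-task tests above (and several tests below) use my_sleep() from omp_my_sleep.h to keep work pending for a fraction of a second; that header is not part of this diff. A plausible portable sketch of such a helper, under the assumption that it simply sleeps for a fractional number of seconds (the suite's actual header may differ):

    #include <time.h>

    // Sleep for the given (possibly fractional) number of seconds.
    static void my_sleep(double sleeptime) {
      struct timespec ts;
      ts.tv_sec  = (time_t)sleeptime;
      ts.tv_nsec = (long)((sleeptime - (double)ts.tv_sec) * 1.0e9);
      nanosleep(&ts, NULL);
    }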
diff --git a/final/runtime/test/tasking/bug_serial_taskgroup.c b/final/runtime/test/tasking/bug_serial_taskgroup.c
new file mode 100644
index 0000000..850bc90
--- /dev/null
+++ b/final/runtime/test/tasking/bug_serial_taskgroup.c
@@ -0,0 +1,16 @@
+// RUN: %libomp-compile-and-run
+
+/*
+ GCC failed this test because __kmp_get_gtid() instead of __kmp_entry_gtid()
+ was called in xexpand(KMP_API_NAME_GOMP_TASKGROUP_START)(void).
+ __kmp_entry_gtid() initializes the runtime if that has not yet been done,
+ which __kmp_get_gtid() does not.
+ */
+
+int main()
+{
+ #pragma omp taskgroup
+ { }
+
+ return 0;
+}
diff --git a/final/runtime/test/tasking/kmp_task_reduction_nest.cpp b/final/runtime/test/tasking/kmp_task_reduction_nest.cpp
new file mode 100644
index 0000000..63dffe4
--- /dev/null
+++ b/final/runtime/test/tasking/kmp_task_reduction_nest.cpp
@@ -0,0 +1,376 @@
+// RUN: %libomp-cxx-compile-and-run
+// RUN: %libomp-cxx-compile -DFLG=1 && %libomp-run
+// GCC-5 is needed for OpenMP 4.0 support (taskgroup)
+// XFAIL: gcc-4
+#include <cstdio>
+#include <cmath>
+#include <cassert>
+#include <omp.h>
+
+// Total number of loop iterations, should be multiple of T for this test
+#define N 10000
+
+// Flag to request lazy (1) or eager (0) allocation of reduction objects
+#ifndef FLG
+#define FLG 0
+#endif
+
+/*
+ // initial user's code that corresponds to pseudo code of the test
+ #pragma omp taskgroup task_reduction(+:i,j) task_reduction(*:x)
+ {
+ for( int l = 0; l < N; ++l ) {
+ #pragma omp task firstprivate(l) in_reduction(+:i) in_reduction(*:x)
+ {
+ i += l;
+ if( l%2 )
+ x *= 1.0 / (l + 1);
+ else
+ x *= (l + 1);
+ }
+ }
+
+ #pragma omp taskgroup task_reduction(-:i,k) task_reduction(+:y)
+ {
+ for( int l = 0; l < N; ++l ) {
+ #pragma omp task firstprivate(l) in_reduction(+:j,y) \
+ in_reduction(*:x) in_reduction(-:k)
+ {
+ j += l;
+ k -= l;
+ y += (double)l;
+ if( l%2 )
+ x *= 1.0 / (l + 1);
+ else
+ x *= (l + 1);
+ }
+ #pragma omp task firstprivate(l) in_reduction(+:y) in_reduction(-:i,k)
+ {
+ i -= l;
+ k -= l;
+ y += (double)l;
+ }
+ #pragma omp task firstprivate(l) in_reduction(+:j) in_reduction(*:x)
+ {
+ j += l;
+ if( l%2 )
+ x *= 1.0 / (l + 1);
+ else
+ x *= (l + 1);
+ }
+ }
+ } // inner reduction
+
+ for( int l = 0; l < N; ++l ) {
+ #pragma omp task firstprivate(l) in_reduction(+:j)
+ j += l;
+ }
+ } // outer reduction
+*/
+
+//------------------------------------------------
+// OpenMP runtime library routines
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern void* __kmpc_task_reduction_get_th_data(int gtid, void* tg, void* item);
+extern void* __kmpc_task_reduction_init(int gtid, int num, void* data);
+extern int __kmpc_global_thread_num(void*);
+#ifdef __cplusplus
+}
+#endif
+
+//------------------------------------------------
+// Compiler-generated code
+
+typedef struct _task_red_item {
+ void *shar; // shared reduction item
+ size_t size; // size of data item
+ void *f_init; // data initialization routine
+ void *f_fini; // data finalization routine
+ void *f_comb; // data combiner routine
+ unsigned flags;
+} _task_red_item_t;
+
+// int:+ no need for init/fini callbacks, valid for subtraction
+void __red_int_add_comb(void *lhs, void *rhs) // combiner
+{ *(int*)lhs += *(int*)rhs; }
+
+// long long:+ no need for init/fini callbacks, valid for subtraction
+void __red_llong_add_comb(void *lhs, void *rhs) // combiner
+{ *(long long*)lhs += *(long long*)rhs; }
+
+// double:* no need for a fini callback
+void __red_dbl_mul_init(void *data) // initializer
+{ *(double*)data = 1.0; }
+void __red_dbl_mul_comb(void *lhs, void *rhs) // combiner
+{ *(double*)lhs *= *(double*)rhs; }
+
+// double:+ no need for init/fini callbacks
+void __red_dbl_add_comb(void *lhs, void *rhs) // combiner
+{ *(double*)lhs += *(double*)rhs; }
+
+// ==============================
+
+void calc_serial(int *pi, long long *pj, double *px, long long *pk, double *py)
+{
+ for( int l = 0; l < N; ++l ) {
+ *pi += l;
+ if( l%2 )
+ *px *= 1.0 / (l + 1);
+ else
+ *px *= (l + 1);
+ }
+ for( int l = 0; l < N; ++l ) {
+ *pj += l;
+ *pk -= l;
+ *py += (double)l;
+ if( l%2 )
+ *px *= 1.0 / (l + 1);
+ else
+ *px *= (l + 1);
+
+ *pi -= l;
+ *pk -= l;
+ *py += (double)l;
+
+ *pj += l;
+ if( l%2 )
+ *px *= 1.0 / (l + 1);
+ else
+ *px *= (l + 1);
+ }
+ for( int l = 0; l < N; ++l ) {
+ *pj += l;
+ }
+}
+
+//------------------------------------------------
+// Test case
+int main()
+{
+ int nthreads = omp_get_max_threads();
+ int err = 0;
+ void** ptrs = (void**)malloc(nthreads*sizeof(void*));
+
+ // user's code ======================================
+ // variables for serial calculations:
+ int is = 3;
+ long long js = -9999999;
+ double xs = 99999.0;
+ long long ks = 99999999;
+ double ys = -99999999.0;
+ // variables for parallel calculations:
+ int ip = 3;
+ long long jp = -9999999;
+ double xp = 99999.0;
+ long long kp = 99999999;
+ double yp = -99999999.0;
+
+ calc_serial(&is, &js, &xs, &ks, &ys);
+ // ==================================================
+ for (int i = 0; i < nthreads; ++i)
+ ptrs[i] = NULL;
+ #pragma omp parallel
+ {
+ #pragma omp single nowait
+ {
+ // outer taskgroup reduces (i,j,x)
+ #pragma omp taskgroup // task_reduction(+:i,j) task_reduction(*:x)
+ {
+ _task_red_item_t red_data[3];
+ red_data[0].shar = &ip;
+ red_data[0].size = sizeof(ip);
+ red_data[0].f_init = NULL; // RTL will zero thread-specific objects
+ red_data[0].f_fini = NULL; // no destructors needed
+ red_data[0].f_comb = (void*)&__red_int_add_comb;
+ red_data[0].flags = FLG;
+ red_data[1].shar = &jp;
+ red_data[1].size = sizeof(jp);
+ red_data[1].f_init = NULL; // RTL will zero thread-specific objects
+ red_data[1].f_fini = NULL; // no destructors needed
+ red_data[1].f_comb = (void*)&__red_llong_add_comb;
+ red_data[1].flags = FLG;
+ red_data[2].shar = &xp;
+ red_data[2].size = sizeof(xp);
+ red_data[2].f_init = (void*)&__red_dbl_mul_init;
+ red_data[2].f_fini = NULL; // no destructors needed
+ red_data[2].f_comb = (void*)&__red_dbl_mul_comb;
+ red_data[2].flags = FLG;
+ int gtid = __kmpc_global_thread_num(NULL);
+ void* tg1 = __kmpc_task_reduction_init(gtid, 3, red_data);
+
+ for( int l = 0; l < N; l += 2 ) {
+ // 2 iterations per task to get correct x value; actually any even
+ // number of iters per task will work, otherwise x loses precision
+ #pragma omp task firstprivate(l) //in_reduction(+:i) in_reduction(*:x)
+ {
+ int gtid = __kmpc_global_thread_num(NULL);
+ int *p_ip = (int*)__kmpc_task_reduction_get_th_data(gtid, tg1, &ip);
+ double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
+ gtid, tg1, &xp);
+ if (!ptrs[gtid]) ptrs[gtid] = p_xp;
+
+ // user's pseudo-code ==============================
+ *p_ip += l;
+ *p_xp *= (l + 1);
+
+ *p_ip += l + 1;
+ *p_xp *= 1.0 / (l + 2);
+ // ==================================================
+ }
+ }
+ // inner taskgroup reduces (i,k,y), i is same object as in outer one
+ #pragma omp taskgroup // task_reduction(-:i,k) task_reduction(+:y)
+ {
+ _task_red_item_t red_data[3];
+ red_data[0].shar = &ip;
+ red_data[0].size = sizeof(ip);
+ red_data[0].f_init = NULL; // RTL will zero thread-specific objects
+ red_data[0].f_fini = NULL; // no destructors needed
+ red_data[0].f_comb = (void*)&__red_int_add_comb;
+ red_data[0].flags = FLG;
+ red_data[1].shar = &kp;
+ red_data[1].size = sizeof(kp);
+ red_data[1].f_init = NULL; // RTL will zero thread-specific objects
+ red_data[1].f_fini = NULL; // no destructors needed
+ red_data[1].f_comb = (void*)&__red_llong_add_comb; // same for + and -
+ red_data[1].flags = FLG;
+ red_data[2].shar = &yp;
+ red_data[2].size = sizeof(yp);
+ red_data[2].f_init = NULL; // RTL will zero thread-specific objects
+ red_data[2].f_fini = NULL; // no destructors needed
+ red_data[2].f_comb = (void*)&__red_dbl_add_comb;
+ red_data[2].flags = FLG;
+ int gtid = __kmpc_global_thread_num(NULL);
+ void* tg2 = __kmpc_task_reduction_init(gtid, 3, red_data);
+
+ for( int l = 0; l < N; l += 2 ) {
+ #pragma omp task firstprivate(l)
+ // in_reduction(+:j,y) in_reduction(*:x) in_reduction(-:k)
+ {
+ int gtid = __kmpc_global_thread_num(NULL);
+ long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
+ gtid, tg1, &jp);
+ long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
+ gtid, tg2, &kp);
+ double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
+ gtid, tg1, &xp);
+ double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
+ gtid, tg2, &yp);
+ // user's pseudo-code ==============================
+ *p_jp += l;
+ *p_kp -= l;
+ *p_yp += (double)l;
+ *p_xp *= (l + 1);
+
+ *p_jp += l + 1;
+ *p_kp -= l + 1;
+ *p_yp += (double)(l + 1);
+ *p_xp *= 1.0 / (l + 2);
+ // =================================================
+{
+ // the following code is here just to check __kmpc_task_reduction_get_th_data:
+ int tid = omp_get_thread_num();
+ void *addr1;
+ void *addr2;
+ addr1 = __kmpc_task_reduction_get_th_data(gtid, tg1, &xp); // from shared
+ addr2 = __kmpc_task_reduction_get_th_data(gtid, tg1, addr1); // from private
+ if (addr1 != addr2) {
+ #pragma omp atomic
+ ++err;
+ printf("Wrong thread-specific addresses %d s:%p p:%p\n", tid, addr1, addr2);
+ }
+ // from neighbour w/o taskgroup (should start lookup from current tg2)
+ if (tid > 0) {
+ if (ptrs[tid-1]) {
+ addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[tid-1]);
+ if (addr1 != addr2) {
+ #pragma omp atomic
+ ++err;
+ printf("Wrong thread-specific addresses %d s:%p n:%p\n",
+ tid, addr1, addr2);
+ }
+ }
+ } else {
+ if (ptrs[nthreads-1]) {
+ addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[nthreads-1]);
+ if (addr1 != addr2) {
+ #pragma omp atomic
+ ++err;
+ printf("Wrong thread-specific addresses %d s:%p n:%p\n",
+ tid, addr1, addr2);
+ }
+ }
+ }
+ // ----------------------------------------------
+}
+ }
+ #pragma omp task firstprivate(l)
+ // in_reduction(+:y) in_reduction(-:i,k)
+ {
+ int gtid = __kmpc_global_thread_num(NULL);
+ int *p_ip = (int*)__kmpc_task_reduction_get_th_data(
+ gtid, tg2, &ip);
+ long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
+ gtid, tg2, &kp);
+ double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
+ gtid, tg2, &yp);
+
+ // user's pseudo-code ==============================
+ *p_ip -= l;
+ *p_kp -= l;
+ *p_yp += (double)l;
+
+ *p_ip -= l + 1;
+ *p_kp -= l + 1;
+ *p_yp += (double)(l + 1);
+ // =================================================
+ }
+ #pragma omp task firstprivate(l)
+ // in_reduction(+:j) in_reduction(*:x)
+ {
+ int gtid = __kmpc_global_thread_num(NULL);
+ long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
+ gtid, tg1, &jp);
+ double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
+ gtid, tg1, &xp);
+ // user's pseudo-code ==============================
+ *p_jp += l;
+ *p_xp *= (l + 1);
+
+ *p_jp += l + 1;
+ *p_xp *= 1.0 / (l + 2);
+ // =================================================
+ }
+ }
+ } // inner reduction
+
+ for( int l = 0; l < N; l += 2 ) {
+ #pragma omp task firstprivate(l) // in_reduction(+:j)
+ {
+ int gtid = __kmpc_global_thread_num(NULL);
+ long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
+ gtid, tg1, &jp);
+ // user's pseudo-code ==============================
+ *p_jp += l;
+ *p_jp += l + 1;
+ // =================================================
+ }
+ }
+ } // outer reduction
+ } // end single
+ } // end parallel
+ // check results
+#if _DEBUG
+ printf("reduction flags = %u\n", FLG);
+#endif
+ if (ip == is && jp == js && ks == kp &&
+ fabs(xp - xs) < 0.01 && fabs(yp - ys) < 0.01)
+ printf("passed\n");
+ else
+ printf("failed,\n ser:(%d %lld %f %lld %f)\n par:(%d %lld %f %lld %f)\n",
+ is, js, xs, ks, ys,
+ ip, jp, xp, kp, yp);
+ return 0;
+}
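The emulation above hand-codes what a compiler emits for the task_reduction/in_reduction clauses. For comparison, a much smaller directive-level sketch of just one additive reduction, assuming a compiler that already implements these OpenMP 5.0 clauses (illustrative, not an additional test in this directory):

    #include <stdio.h>

    #define N 10000

    int main(void) {
      long long i = 0;
      #pragma omp parallel
      #pragma omp single
      #pragma omp taskgroup task_reduction(+: i)
      {
        for (int l = 0; l < N; ++l) {
          #pragma omp task firstprivate(l) in_reduction(+: i)
          i += l;                   // adds into a thread-specific copy of i
        }
      }                             // copies are combined when the taskgroup ends
      printf("i = %lld (expected %lld)\n", i, (long long)N * (N - 1) / 2);
      return 0;
    }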
diff --git a/final/runtime/test/tasking/kmp_taskloop.c b/final/runtime/test/tasking/kmp_taskloop.c
new file mode 100644
index 0000000..4b13793
--- /dev/null
+++ b/final/runtime/test/tasking/kmp_taskloop.c
@@ -0,0 +1,159 @@
+// RUN: %libomp-compile-and-run
+// RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run
+#include <stdio.h>
+#include <omp.h>
+#include "omp_my_sleep.h"
+
+#define N 4
+#define GRAIN 10
+#define STRIDE 3
+
+// globals
+int th_counter[N];
+int counter;
+
+
+// Compiler-generated code (emulation)
+typedef struct ident {
+ void* dummy;
+} ident_t;
+
+typedef struct shar {
+ int(*pth_counter)[N];
+ int *pcounter;
+ int *pj;
+} *pshareds;
+
+typedef struct task {
+ pshareds shareds;
+ int(* routine)(int,struct task*);
+ int part_id;
+// privates:
+ unsigned long long lb; // library always uses ULONG
+ unsigned long long ub;
+ int st;
+ int last;
+ int i;
+ int j;
+ int th;
+} *ptask, kmp_task_t;
+
+typedef int(* task_entry_t)( int, ptask );
+
+void
+__task_dup_entry(ptask task_dst, ptask task_src, int lastpriv)
+{
+// setup lastprivate flag
+ task_dst->last = lastpriv;
+// could be constructor calls here...
+}
+
+
+// OpenMP RTL interfaces
+typedef unsigned long long kmp_uint64;
+typedef long long kmp_int64;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+void
+__kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
+ kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+ int nogroup, int sched, kmp_int64 grainsize, void *task_dup );
+ptask
+__kmpc_omp_task_alloc( ident_t *loc, int gtid, int flags,
+ size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+ task_entry_t task_entry );
+void __kmpc_atomic_fixed4_add(void *id_ref, int gtid, int * lhs, int rhs);
+int __kmpc_global_thread_num(void *id_ref);
+#ifdef __cplusplus
+}
+#endif
+
+
+// User's code
+int task_entry(int gtid, ptask task)
+{
+ pshareds pshar = task->shareds;
+ for( task->i = task->lb; task->i <= (int)task->ub; task->i += task->st ) {
+ task->th = omp_get_thread_num();
+ __kmpc_atomic_fixed4_add(NULL,gtid,pshar->pcounter,1);
+ __kmpc_atomic_fixed4_add(NULL,gtid,&((*pshar->pth_counter)[task->th]),1);
+ task->j = task->i;
+ }
+ my_sleep( 0.1 ); // sleep 100 ms in order to allow other threads to steal tasks
+ if( task->last ) {
+ *(pshar->pj) = task->j; // lastprivate
+ }
+ return 0;
+}
+
+int main()
+{
+ int i, j, gtid = __kmpc_global_thread_num(NULL);
+ ptask task;
+ pshareds psh;
+ omp_set_dynamic(0);
+ counter = 0;
+ for( i=0; i<N; ++i )
+ th_counter[i] = 0;
+ #pragma omp parallel num_threads(N)
+ {
+ #pragma omp master
+ {
+ int gtid = __kmpc_global_thread_num(NULL);
+/*
+ * This is what the OpenMP runtime calls correspond to:
+ #pragma omp taskloop num_tasks(N) lastprivate(j)
+ for( i=0; i<N*GRAIN*STRIDE-1; i+=STRIDE )
+ {
+ int th = omp_get_thread_num();
+ #pragma omp atomic
+ counter++;
+ #pragma omp atomic
+ th_counter[th]++;
+ j = i;
+ }
+*/
+ task = __kmpc_omp_task_alloc(NULL,gtid,1,sizeof(struct task),sizeof(struct shar),&task_entry);
+ psh = task->shareds;
+ psh->pth_counter = &th_counter;
+ psh->pcounter = &counter;
+ psh->pj = &j;
+ task->lb = 0;
+ task->ub = N*GRAIN*STRIDE-2;
+ task->st = STRIDE;
+
+ __kmpc_taskloop(
+ NULL, // location
+ gtid, // gtid
+ task, // task structure
+ 1, // if clause value
+ &task->lb, // lower bound
+ &task->ub, // upper bound
+ STRIDE, // loop increment
+ 0, // 1 if nogroup specified
+ 2, // schedule type: 0-none, 1-grainsize, 2-num_tasks
+ N, // schedule value (ignored for type 0)
+ (void*)&__task_dup_entry // tasks duplication routine
+ );
+ } // end master
+ } // end parallel
+// check results
+ if( j != N*GRAIN*STRIDE-STRIDE ) {
+ printf("Error in lastprivate, %d != %d\n",j,N*GRAIN*STRIDE-STRIDE);
+ return 1;
+ }
+ if( counter != N*GRAIN ) {
+ printf("Error, counter %d != %d\n",counter,N*GRAIN);
+ return 1;
+ }
+ for( i=0; i<N; ++i ) {
+ if( th_counter[i] % GRAIN ) {
+ printf("Error, th_counter[%d] = %d\n",i,th_counter[i]);
+ return 1;
+ }
+ }
+ printf("passed\n");
+ return 0;
+}
diff --git a/final/runtime/test/tasking/nested_parallel_tasking.c b/final/runtime/test/tasking/nested_parallel_tasking.c
new file mode 100644
index 0000000..4374d6e
--- /dev/null
+++ b/final/runtime/test/tasking/nested_parallel_tasking.c
@@ -0,0 +1,32 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <omp.h>
+
+/*
+ * This test would hang when the nesting level, instead of the active level,
+ * was used to push the task state.
+ */
+
+int main()
+{
+ // If num_threads is changed to a value greater than 1, then the test passes
+ #pragma omp parallel num_threads(1)
+ {
+ #pragma omp parallel
+ printf("Hello World from thread %d\n", omp_get_thread_num());
+ }
+
+ printf("omp_num_threads: %d\n", omp_get_max_threads());
+
+ #pragma omp parallel
+ {
+ #pragma omp master
+ #pragma omp task default(none)
+ {
+ printf("%d is executing this task\n", omp_get_thread_num());
+ }
+ }
+
+ printf("pass\n");
+ return 0;
+}
diff --git a/final/runtime/test/tasking/nested_task_creation.c b/final/runtime/test/tasking/nested_task_creation.c
new file mode 100644
index 0000000..c7c25fc
--- /dev/null
+++ b/final/runtime/test/tasking/nested_task_creation.c
@@ -0,0 +1,35 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <omp.h>
+#include "omp_my_sleep.h"
+
+/*
+ * This test creates tasks that themselves create a new task.
+ * The runtime has to take care that they are correctly freed.
+ */
+
+int main()
+{
+ #pragma omp task
+ {
+ #pragma omp task
+ {
+ my_sleep( 0.1 );
+ }
+ }
+
+ #pragma omp parallel num_threads(2)
+ {
+ #pragma omp single
+ #pragma omp task
+ {
+ #pragma omp task
+ {
+ my_sleep( 0.1 );
+ }
+ }
+ }
+
+ printf("pass\n");
+ return 0;
+}
diff --git a/final/runtime/test/tasking/omp_task.c b/final/runtime/test/tasking/omp_task.c
new file mode 100644
index 0000000..c534abe
--- /dev/null
+++ b/final/runtime/test/tasking/omp_task.c
@@ -0,0 +1,52 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+int test_omp_task()
+{
+ int tids[NUM_TASKS];
+ int i;
+
+ #pragma omp parallel
+ {
+ #pragma omp single
+ {
+ for (i = 0; i < NUM_TASKS; i++) {
+ /* First we have to store the value of the loop index in a new variable
+ * that is private to each task; otherwise the index would be overwritten
+ * if executing the task takes longer than advancing the loop to its next
+ * iteration.
+ */
+ int myi;
+ myi = i;
+ #pragma omp task
+ {
+ my_sleep (SLEEPTIME);
+ tids[myi] = omp_get_thread_num();
+ } /* end of omp task */
+ } /* end of for */
+ } /* end of single */
+ } /*end of parallel */
+
+ /* Now we check whether more than one thread executed the tasks. */
+ for (i = 1; i < NUM_TASKS; i++) {
+ if (tids[0] != tids[i])
+ return 1;
+ }
+ return 0;
+} /* end of test_omp_task */
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_task()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
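omp_task.c and the following tests rely on omp_testsuite.h for the constants NUM_TASKS, REPETITIONS, LOOPCOUNT and SLEEPTIME, and on omp_my_sleep.h for my_sleep(); neither header is part of this diff. Illustrative stand-in definitions, under the assumption that the constants are plain macros (the real header's values may differ):

    // Hypothetical stand-ins for the constants supplied by omp_testsuite.h.
    #define NUM_TASKS   25      // number of explicit tasks generated per test
    #define REPETITIONS 10      // how often main() repeats each test function
    #define LOOPCOUNT   1000    // trip count of the inner summation loops
    #define SLEEPTIME   0.01    // seconds handed to my_sleep()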
diff --git a/final/runtime/test/tasking/omp_task_final.c b/final/runtime/test/tasking/omp_task_final.c
new file mode 100644
index 0000000..b531af6
--- /dev/null
+++ b/final/runtime/test/tasking/omp_task_final.c
@@ -0,0 +1,65 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+int test_omp_task_final()
+{
+ int tids[NUM_TASKS];
+ int includedtids[NUM_TASKS];
+ int i;
+ int error = 0;
+ #pragma omp parallel
+ {
+ #pragma omp single
+ {
+ for (i = 0; i < NUM_TASKS; i++) {
+ /* First we have to store the value of the loop index in a new variable
+ * that is private to each task; otherwise the index would be overwritten
+ * if executing the task takes longer than advancing the loop to its next
+ * iteration.
+ */
+ int myi;
+ myi = i;
+
+ #pragma omp task final(i>=10)
+ {
+ tids[myi] = omp_get_thread_num();
+ /* we generate included tasks for final tasks */
+ if(myi >= 10) {
+ int included = myi;
+ #pragma omp task
+ {
+ my_sleep (SLEEPTIME);
+ includedtids[included] = omp_get_thread_num();
+ } /* end of omp included task of the final task */
+ my_sleep (SLEEPTIME);
+ } /* end of if it is a final task*/
+ } /* end of omp task */
+ } /* end of for */
+ } /* end of single */
+ } /*end of parallel */
+
+ /* Now we check that each final task and its included task were executed by the same thread. */
+ for (i = 10; i < NUM_TASKS; i++) {
+ if (tids[i] != includedtids[i]) {
+ error++;
+ }
+ }
+ return (error==0);
+} /* end of test_omp_task_final */
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_task_final()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
+
diff --git a/final/runtime/test/tasking/omp_task_firstprivate.c b/final/runtime/test/tasking/omp_task_firstprivate.c
new file mode 100644
index 0000000..d1f7c35
--- /dev/null
+++ b/final/runtime/test/tasking/omp_task_firstprivate.c
@@ -0,0 +1,51 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+int test_omp_task_firstprivate()
+{
+ int i;
+ int sum = 1234;
+ int known_sum;
+ int result = 0; /* counts the wrong sums from tasks */
+
+ known_sum = 1234 + (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+
+ #pragma omp parallel
+ {
+ #pragma omp single
+ {
+ for (i = 0; i < NUM_TASKS; i++) {
+ #pragma omp task firstprivate(sum)
+ {
+ int j;
+ for (j = 0; j <= LOOPCOUNT; j++) {
+ #pragma omp flush
+ sum += j;
+ }
+
+ /* check if calculated sum was right */
+ if (sum != known_sum) {
+ #pragma omp critical
+ { result++; }
+ }
+ } /* omp task */
+ } /* for loop */
+ } /* omp single */
+ } /* omp parallel */
+ return (result == 0);
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_task_firstprivate()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
diff --git a/final/runtime/test/tasking/omp_task_if.c b/final/runtime/test/tasking/omp_task_if.c
new file mode 100644
index 0000000..8b4728e
--- /dev/null
+++ b/final/runtime/test/tasking/omp_task_if.c
@@ -0,0 +1,43 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+int test_omp_task_if()
+{
+ int condition_false;
+ int count;
+ int result;
+
+ count=0;
+ condition_false = (count == 1);
+ #pragma omp parallel
+ {
+ #pragma omp single
+ {
+ #pragma omp task if (condition_false) shared(count, result)
+ {
+ my_sleep (SLEEPTIME);
+ #pragma omp critical
+ result = (0 == count);
+ } /* end of omp task */
+ #pragma omp critical
+ count = 1;
+ } /* end of single */
+ } /*end of parallel */
+ return result;
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_task_if()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
diff --git a/final/runtime/test/tasking/omp_task_imp_firstprivate.c b/final/runtime/test/tasking/omp_task_imp_firstprivate.c
new file mode 100644
index 0000000..905ab9a
--- /dev/null
+++ b/final/runtime/test/tasking/omp_task_imp_firstprivate.c
@@ -0,0 +1,47 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+/* Check that a variable that is firstprivate on the parallel region is implicitly firstprivate in tasks created inside it */
+int test_omp_task_imp_firstprivate()
+{
+ int i=5;
+ int k = 0;
+ int result = 0;
+ int task_result = 1;
+ #pragma omp parallel firstprivate(i)
+ {
+ #pragma omp single
+ {
+ for (k = 0; k < NUM_TASKS; k++) {
+ #pragma omp task shared(result , task_result)
+ {
+ int j;
+ //check if i is private
+ if(i != 5)
+ task_result = 0;
+ for(j = 0; j < NUM_TASKS; j++)
+ i++;
+ //this should be firstprivate implicitly
+ }
+ }
+ #pragma omp taskwait
+ result = (task_result && i==5);
+ }
+ }
+ return result;
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_task_imp_firstprivate()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
diff --git a/final/runtime/test/tasking/omp_task_priority.c b/final/runtime/test/tasking/omp_task_priority.c
new file mode 100644
index 0000000..7b62360
--- /dev/null
+++ b/final/runtime/test/tasking/omp_task_priority.c
@@ -0,0 +1,22 @@
+// RUN: %libomp-compile && env OMP_MAX_TASK_PRIORITY=42 %libomp-run
+// Test OMP 4.5 task priorities
+// Currently only the API function and environment variable parsing are implemented.
+// The RUN line sets the environment variable OMP_MAX_TASK_PRIORITY=42, which is checked below.
+#include <stdio.h>
+#include <omp.h>
+
+int main (void) {
+ int passed;
+
+ passed = (omp_get_max_task_priority() == 42);
+ printf("Got %d\n", omp_get_max_task_priority());
+
+ if (passed) {
+ printf("passed\n");
+ return 0;
+ }
+
+ printf("failed\n");
+ return 1;
+}
+
diff --git a/final/runtime/test/tasking/omp_task_private.c b/final/runtime/test/tasking/omp_task_private.c
new file mode 100644
index 0000000..7a93716
--- /dev/null
+++ b/final/runtime/test/tasking/omp_task_private.c
@@ -0,0 +1,53 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+/* Check that a variable declared private in a task is indeed private to each task */
+int test_omp_task_private()
+{
+ int i;
+ int known_sum;
+ int sum = 0;
+ int result = 0; /* counts the wrong sums from tasks */
+
+ known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+
+ #pragma omp parallel
+ {
+ #pragma omp single
+ {
+ for (i = 0; i < NUM_TASKS; i++) {
+ #pragma omp task private(sum) shared(result, known_sum)
+ {
+ int j;
+ //if sum is private, initialize to 0
+ sum = 0;
+ for (j = 0; j <= LOOPCOUNT; j++) {
+ #pragma omp flush
+ sum += j;
+ }
+ /* check if calculated sum was right */
+ if (sum != known_sum) {
+ #pragma omp critical
+ result++;
+ }
+ } /* end of omp task */
+ } /* end of for */
+ } /* end of single */
+ } /* end of parallel*/
+ return (result == 0);
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_task_private()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
diff --git a/final/runtime/test/tasking/omp_task_shared.c b/final/runtime/test/tasking/omp_task_shared.c
new file mode 100644
index 0000000..0304026
--- /dev/null
+++ b/final/runtime/test/tasking/omp_task_shared.c
@@ -0,0 +1,41 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+/* Check that a variable from the enclosing scope is shared among all tasks */
+int test_omp_task_imp_shared()
+{
+ int i;
+ int k = 0;
+ int result = 0;
+ i=0;
+
+ #pragma omp parallel
+ {
+ #pragma omp single
+ for (k = 0; k < NUM_TASKS; k++) {
+ #pragma omp task shared(i)
+ {
+ #pragma omp atomic
+ i++;
+ //this should be shared implicitly
+ }
+ }
+ }
+ result = i;
+ return ((result == NUM_TASKS));
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_task_imp_shared()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
diff --git a/final/runtime/test/tasking/omp_taskloop_grainsize.c b/final/runtime/test/tasking/omp_taskloop_grainsize.c
new file mode 100644
index 0000000..0833073
--- /dev/null
+++ b/final/runtime/test/tasking/omp_taskloop_grainsize.c
@@ -0,0 +1,113 @@
+// RUN: %libomp-compile-and-run
+// RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run
+
+// These compilers don't support the taskloop construct
+// UNSUPPORTED: gcc-4, gcc-5, icc-16
+// GCC 6 has support for taskloops, but at least 6.3.0 is crashing on this test
+// UNSUPPORTED: gcc-6
+
+/*
+ * Test for taskloop
+ * Method: calculate how many times the iteration space is dispatched
+ * and check whether each dispatch has the requested grainsize.
+ * It is possible for two adjacent chunks to be executed by the same thread.
+ */
+#include <stdio.h>
+#include <omp.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+
+#define CFDMAX_SIZE 1120
+
+int test_omp_taskloop_grainsize()
+{
+ int result = 0;
+ int i, grainsize, count, tmp_count, num_off;
+ int *tmp, *tids, *tidsArray;
+
+ tidsArray = (int *)malloc(sizeof(int) * CFDMAX_SIZE);
+ tids = tidsArray;
+
+ for (grainsize = 1; grainsize < 48; ++grainsize) {
+ fprintf(stderr, "Grainsize %d\n", grainsize);
+ count = tmp_count = num_off = 0;
+
+ for (i = 0; i < CFDMAX_SIZE; ++i) {
+ tids[i] = -1;
+ }
+
+ #pragma omp parallel shared(tids)
+ {
+ #pragma omp master
+ #pragma omp taskloop grainsize(grainsize)
+ for (i = 0; i < CFDMAX_SIZE; i++) {
+ tids[i] = omp_get_thread_num();
+ }
+ }
+
+ for (i = 0; i < CFDMAX_SIZE; ++i) {
+ if (tids[i] == -1) {
+ fprintf(stderr, " Iteration %d not touched!\n", i);
+ result++;
+ }
+ }
+
+ for (i = 0; i < CFDMAX_SIZE - 1; ++i) {
+ if (tids[i] != tids[i + 1]) {
+ count++;
+ }
+ }
+
+ tmp = (int *)malloc(sizeof(int) * (count + 1));
+ tmp[0] = 1;
+
+ for (i = 0; i < CFDMAX_SIZE - 1; ++i) {
+ if (tmp_count > count) {
+ printf("--------------------\nTestinternal Error: List too "
+ "small!!!\n--------------------\n");
+ break;
+ }
+ if (tids[i] != tids[i + 1]) {
+ tmp_count++;
+ tmp[tmp_count] = 1;
+ } else {
+ tmp[tmp_count]++;
+ }
+ }
+
+ // is the grainsize clause working?
+ int num_tasks = CFDMAX_SIZE / grainsize;
+ int multiple1 = CFDMAX_SIZE / num_tasks;
+ int multiple2 = CFDMAX_SIZE / num_tasks + 1;
+ for (i = 0; i < count; i++) {
+ // it is possible for 2 adjacent chunks to be assigned to the same thread
+ if (tmp[i] % multiple1 != 0 && tmp[i] % multiple2 != 0) {
+ num_off++;
+ }
+ }
+
+ if (num_off > 1) {
+ fprintf(stderr, " The number of bad chunks is %d\n", num_off);
+ result++;
+ } else {
+ fprintf(stderr, " Everything ok\n");
+ }
+
+ free(tmp);
+ }
+ free(tidsArray);
+ return (result==0);
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for (i = 0; i < REPETITIONS; i++) {
+ if (!test_omp_taskloop_grainsize()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
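The chunk-size check in test_omp_taskloop_grainsize() accepts two lengths per grainsize because the runtime splits the iteration space as evenly as possible. For example, with CFDMAX_SIZE = 1120 and grainsize = 13, num_tasks = 1120 / 13 = 86, so chunks of 1120 / 86 = 13 or 14 iterations are both valid, and a run of iterations on one thread only counts as "bad" if its length is a multiple of neither. A small stand-alone sketch of that arithmetic (illustrative only):

    #include <stdio.h>

    #define CFDMAX_SIZE 1120

    // Print the two chunk sizes accepted for every grainsize the test uses,
    // mirroring the arithmetic in test_omp_taskloop_grainsize() above.
    int main(void) {
      for (int grainsize = 1; grainsize < 48; ++grainsize) {
        int num_tasks = CFDMAX_SIZE / grainsize;
        int multiple1 = CFDMAX_SIZE / num_tasks;
        int multiple2 = CFDMAX_SIZE / num_tasks + 1;
        printf("grainsize %2d -> num_tasks %3d, accepted chunk sizes %d or %d\n",
               grainsize, num_tasks, multiple1, multiple2);
      }
      return 0;
    }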
diff --git a/final/runtime/test/tasking/omp_taskloop_num_tasks.c b/final/runtime/test/tasking/omp_taskloop_num_tasks.c
new file mode 100644
index 0000000..7c3c704
--- /dev/null
+++ b/final/runtime/test/tasking/omp_taskloop_num_tasks.c
@@ -0,0 +1,71 @@
+// RUN: %libomp-compile-and-run
+// RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run
+
+// These compilers don't support the taskloop construct
+// UNSUPPORTED: gcc-4, gcc-5, icc-16
+
+/*
+ * Test for taskloop
+ * Method: calculate how many times the iteration space is dispatched
+ * and check whether the number of dispatches matches the requested number of tasks.
+ * It is possible for two adjacent chunks to be executed by the same thread.
+ */
+#include <stdio.h>
+#include <omp.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+
+#define CFDMAX_SIZE 1120
+
+int test_omp_taskloop_num_tasks()
+{
+ int i;
+ int *tids;
+ int *tidsArray;
+ int count;
+ int result = 0;
+ int num_tasks;
+
+ for (num_tasks = 1; num_tasks < 120; ++num_tasks) {
+ count = 0;
+ tidsArray = (int *)malloc(sizeof(int) * CFDMAX_SIZE);
+ tids = tidsArray;
+
+ #pragma omp parallel shared(tids)
+ {
+ int i;
+ #pragma omp master
+ #pragma omp taskloop num_tasks(num_tasks)
+ for (i = 0; i < CFDMAX_SIZE; i++) {
+ tids[i] = omp_get_thread_num();
+ }
+ }
+
+ for (i = 0; i < CFDMAX_SIZE - 1; ++i) {
+ if (tids[i] != tids[i + 1]) {
+ count++;
+ }
+ }
+
+ if (count > num_tasks) {
+ fprintf(stderr, "counted too many tasks: (wanted %d, got %d)\n",
+ num_tasks, count);
+ result++;
+ }
+ }
+
+ return (result==0);
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for (i = 0; i < REPETITIONS; i++) {
+ if (!test_omp_taskloop_num_tasks()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
diff --git a/final/runtime/test/tasking/omp_taskwait.c b/final/runtime/test/tasking/omp_taskwait.c
new file mode 100644
index 0000000..c3a0ea7
--- /dev/null
+++ b/final/runtime/test/tasking/omp_taskwait.c
@@ -0,0 +1,74 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+int test_omp_taskwait()
+{
+ int result1 = 0; /* Stores number of not finished tasks after the taskwait */
+ int result2 = 0; /* Stores number of wrong array elements at the end */
+ int array[NUM_TASKS];
+ int i;
+
+ /* fill array */
+ for (i = 0; i < NUM_TASKS; i++)
+ array[i] = 0;
+
+ #pragma omp parallel
+ {
+ #pragma omp single
+ {
+ for (i = 0; i < NUM_TASKS; i++) {
+ /* First we have to store the value of the loop index in a new variable
+ * that is private to each task; otherwise the index would be overwritten
+ * if executing the task takes longer than advancing the loop to its next
+ * iteration.
+ */
+ int myi;
+ myi = i;
+ #pragma omp task
+ {
+ my_sleep (SLEEPTIME);
+ array[myi] = 1;
+ } /* end of omp task */
+ } /* end of for */
+ #pragma omp taskwait
+ /* check if all tasks were finished */
+ for (i = 0; i < NUM_TASKS; i++)
+ if (array[i] != 1)
+ result1++;
+
+ /* generate some more tasks which now shall overwrite
+ * the values in the array */
+ for (i = 0; i < NUM_TASKS; i++) {
+ int myi;
+ myi = i;
+ #pragma omp task
+ {
+ array[myi] = 2;
+ } /* end of omp task */
+ } /* end of for */
+ } /* end of single */
+ } /*end of parallel */
+
+ /* final check, if all array elements contain the right values: */
+ for (i = 0; i < NUM_TASKS; i++) {
+ if (array[i] != 2)
+ result2++;
+ }
+ return ((result1 == 0) && (result2 == 0));
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_taskwait()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
diff --git a/final/runtime/test/tasking/omp_taskyield.c b/final/runtime/test/tasking/omp_taskyield.c
new file mode 100644
index 0000000..5bb6984
--- /dev/null
+++ b/final/runtime/test/tasking/omp_taskyield.c
@@ -0,0 +1,58 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+int test_omp_taskyield()
+{
+ int i;
+ int count = 0;
+ int start_tid[NUM_TASKS];
+ int current_tid[NUM_TASKS];
+
+ for (i=0; i< NUM_TASKS; i++) {
+ start_tid[i]=0;
+ current_tid[i]=0;
+ }
+
+ #pragma omp parallel
+ {
+ #pragma omp single
+ {
+ for (i = 0; i < NUM_TASKS; i++) {
+ int myi = i;
+ #pragma omp task untied
+ {
+ my_sleep(SLEEPTIME);
+ start_tid[myi] = omp_get_thread_num();
+ #pragma omp taskyield
+ if((start_tid[myi] %2) ==0){
+ my_sleep(SLEEPTIME);
+ current_tid[myi] = omp_get_thread_num();
+ } /*end of if*/
+ } /* end of omp task */
+ } /* end of for */
+ } /* end of single */
+ } /* end of parallel */
+ for (i=0;i<NUM_TASKS; i++) {
+ //printf("start_tid[%d]=%d, current_tid[%d]=%d\n",
+ //i, start_tid[i], i , current_tid[i]);
+ if (current_tid[i] == start_tid[i])
+ count++;
+ }
+ return (count<NUM_TASKS);
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_taskyield()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}