diff options
Diffstat (limited to 'final/runtime/test')
181 files changed, 15167 insertions, 0 deletions
diff --git a/final/runtime/test/CMakeLists.txt b/final/runtime/test/CMakeLists.txt new file mode 100644 index 0000000..8f367c5 --- /dev/null +++ b/final/runtime/test/CMakeLists.txt @@ -0,0 +1,37 @@ +# CMakeLists.txt file for unit testing OpenMP host runtime library. +include(CheckFunctionExists) +include(CheckLibraryExists) + +# Some tests use math functions +check_library_exists(m sqrt "" LIBOMP_HAVE_LIBM) +# When using libgcc, -latomic may be needed for atomics +# (but when using compiler-rt, the atomics will be built-in) +# Note: we can not check for __atomic_load because clang treats it +# as special built-in and that breaks CMake checks +check_function_exists(__atomic_load_1 LIBOMP_HAVE_BUILTIN_ATOMIC) +if(NOT LIBOMP_HAVE_BUILTIN_ATOMIC) + check_library_exists(atomic __atomic_load_1 "" LIBOMP_HAVE_LIBATOMIC) +else() + # not needed + set(LIBOMP_HAVE_LIBATOMIC 0) +endif() + +macro(pythonize_bool var) + if (${var}) + set(${var} True) + else() + set(${var} False) + endif() +endmacro() + +pythonize_bool(LIBOMP_USE_HWLOC) +pythonize_bool(LIBOMP_OMPT_SUPPORT) +pythonize_bool(LIBOMP_OMPT_OPTIONAL) +pythonize_bool(LIBOMP_HAVE_LIBM) +pythonize_bool(LIBOMP_HAVE_LIBATOMIC) + +add_openmp_testsuite(check-libomp "Running libomp tests" ${CMAKE_CURRENT_BINARY_DIR} DEPENDS omp) + +# Configure the lit.site.cfg.in file +set(AUTO_GEN_COMMENT "## Autogenerated by libomp configuration.\n# Do not edit!") +configure_file(lit.site.cfg.in lit.site.cfg @ONLY) diff --git a/final/runtime/test/api/has_openmp.c b/final/runtime/test/api/has_openmp.c new file mode 100644 index 0000000..da95f59 --- /dev/null +++ b/final/runtime/test/api/has_openmp.c @@ -0,0 +1,23 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <stdlib.h> +#include "omp_testsuite.h" + +int test_has_openmp() +{ + int rvalue = 0; +#ifdef _OPENMP + rvalue = 1; +#endif + return (rvalue); +} + +int main() +{ + int i; + int num_failed=0; + if(!test_has_openmp()) { + num_failed++; + } + return num_failed; +} diff 
--git a/final/runtime/test/api/kmp_aligned_malloc.c b/final/runtime/test/api/kmp_aligned_malloc.c new file mode 100644 index 0000000..5302fec --- /dev/null +++ b/final/runtime/test/api/kmp_aligned_malloc.c @@ -0,0 +1,62 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <stdint.h> +#include <omp.h> +#include "omp_testsuite.h" + +int alignments[] = {64, 128, 256, 512, 1024, 2048, 4096}; + +unsigned aligned_by(uint64_t addr) { + uint64_t alignment = 1; + while((addr & (alignment-1)) == 0) { + alignment <<= 1; + } + return (alignment >> 1); +} + +int test_kmp_aligned_malloc() +{ + int err = 0; + #pragma omp parallel shared(err) + { + int i; + int* ptr; + uint64_t addr; + int tid = omp_get_thread_num(); + + for(i = 0; i < sizeof(alignments)/sizeof(int); i++) { + int alignment = alignments[i]; + // allocate 64 bytes with 64-byte alignment + // allocate 128 bytes with 128-byte alignment, etc. + ptr = (int*)kmp_aligned_malloc(alignment, alignment); + addr = (uint64_t)ptr; + if(addr & (alignment-1)) { + printf("thread %d: addr = %p (aligned to %u bytes) but expected " + " alignment = %d\n", tid, ptr, aligned_by(addr), alignment); + err = 1; + } + kmp_free(ptr); + } + + ptr = kmp_aligned_malloc(128, 127); + if (ptr != NULL) { + printf("thread %d: kmp_aligned_malloc() didn't return NULL when " + "alignment was not power of 2\n", tid); + err = 1; + } + } /* end of parallel */ + return !err; +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_kmp_aligned_malloc()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/api/kmp_set_defaults_lock_bug.c b/final/runtime/test/api/kmp_set_defaults_lock_bug.c new file mode 100644 index 0000000..73a7afb --- /dev/null +++ b/final/runtime/test/api/kmp_set_defaults_lock_bug.c @@ -0,0 +1,53 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" +/* The bug occurs if the lock table is reallocated after + kmp_set_defaults() 
is called. If the table is reallocated, + then the lock will not point to a valid lock object after the + kmp_set_defaults() call.*/ +omp_lock_t lock; + +int test_kmp_set_defaults_lock_bug() +{ + /* checks that omp_get_num_threads is equal to the number of + threads */ + int nthreads_lib; + int nthreads = 0; + + nthreads_lib = -1; + + #pragma omp parallel + { + omp_set_lock(&lock); + nthreads++; + omp_unset_lock(&lock); + #pragma omp single + { + nthreads_lib = omp_get_num_threads (); + } /* end of single */ + } /* end of parallel */ + kmp_set_defaults("OMP_NUM_THREADS"); + #pragma omp parallel + { + omp_set_lock(&lock); + nthreads++; + omp_unset_lock(&lock); + } /* end of parallel */ + + return (nthreads == 2*nthreads_lib); +} + +int main() +{ + int i; + int num_failed=0; + omp_init_lock(&lock); + + for(i = 0; i < REPETITIONS; i++) { + if(!test_kmp_set_defaults_lock_bug()) { + num_failed++; + } + } + omp_destroy_lock(&lock); + return num_failed; +} diff --git a/final/runtime/test/api/omp_get_num_threads.c b/final/runtime/test/api/omp_get_num_threads.c new file mode 100644 index 0000000..daf286d --- /dev/null +++ b/final/runtime/test/api/omp_get_num_threads.c @@ -0,0 +1,39 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" + +int test_omp_get_num_threads() +{ + /* checks that omp_get_num_threads is equal to the number of + threads */ + int nthreads_lib; + int nthreads = 0; + + nthreads_lib = -1; + + #pragma omp parallel + { + #pragma omp critical + { + nthreads++; + } /* end of critical */ + #pragma omp single + { + nthreads_lib = omp_get_num_threads (); + } /* end of single */ + } /* end of parallel */ + return (nthreads == nthreads_lib); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_get_num_threads()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/api/omp_get_wtick.c b/final/runtime/test/api/omp_get_wtick.c new file mode 100644 index 
0000000..8b35226 --- /dev/null +++ b/final/runtime/test/api/omp_get_wtick.c @@ -0,0 +1,24 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" + +int test_omp_get_wtick() +{ + double tick; + tick = -1.; + tick = omp_get_wtick (); + return ((tick > 0.0) && (tick < 0.01)); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_get_wtick()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/api/omp_get_wtime.c b/final/runtime/test/api/omp_get_wtime.c new file mode 100644 index 0000000..b309440 --- /dev/null +++ b/final/runtime/test/api/omp_get_wtime.c @@ -0,0 +1,33 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <stdlib.h> +#include "omp_testsuite.h" +#include "omp_my_sleep.h" + +int test_omp_get_wtime() +{ + double start; + double end; + double measured_time; + double wait_time = 5.0; + start = 0; + end = 0; + start = omp_get_wtime(); + my_sleep (wait_time); + end = omp_get_wtime(); + measured_time = end-start; + return ((measured_time > 0.97 * wait_time) && (measured_time < 1.03 * wait_time)) ; +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_get_wtime()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/api/omp_in_parallel.c b/final/runtime/test/api/omp_in_parallel.c new file mode 100644 index 0000000..d09313e --- /dev/null +++ b/final/runtime/test/api/omp_in_parallel.c @@ -0,0 +1,39 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" + +/* + * Checks that false is returned when called from serial region + * and true is returned when called within parallel region. 
+ */ +int test_omp_in_parallel() +{ + int serial; + int isparallel; + + serial = 1; + isparallel = 0; + serial = omp_in_parallel(); + + #pragma omp parallel + { + #pragma omp single + { + isparallel = omp_in_parallel(); + } + } + return (!(serial) && isparallel); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_in_parallel()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/atomic/omp_atomic.c b/final/runtime/test/atomic/omp_atomic.c new file mode 100644 index 0000000..7cdd30d --- /dev/null +++ b/final/runtime/test/atomic/omp_atomic.c @@ -0,0 +1,366 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <math.h> +#include "omp_testsuite.h" + +#define DOUBLE_DIGITS 20 /* dt^DOUBLE_DIGITS */ +#define MAX_FACTOR 10 +#define KNOWN_PRODUCT 3628800 /* 10! */ + +int test_omp_atomic() +{ + int sum; + int diff; + double dsum = 0; + double dt = 0.5; /* base of geometric row for + and - test*/ + double ddiff; + int product; + int x; + int *logics; + int bit_and = 1; + int bit_or = 0; + int exclusiv_bit_or = 0; + int j; + int known_sum; + int known_diff; + int known_product; + int result = 0; + int logic_and = 1; + int logic_or = 0; + double dknown_sum; + double rounding_error = 1.E-9; + double dpt, div; + int logicsArray[LOOPCOUNT]; + logics = logicsArray; + + sum = 0; + diff = 0; + product = 1; + + // sum of integers test + #pragma omp parallel + { + int i; + #pragma omp for + for (i = 1; i <= LOOPCOUNT; i++) { + #pragma omp atomic + sum += i; + } + + } + known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; + if (known_sum != sum) + { + fprintf(stderr, + "Error in sum with integers: Result was %d instead of %d.\n", + sum, known_sum); + result++; + } + + // difference of integers test + #pragma omp parallel + { + int i; + #pragma omp for + for (i = 0; i < LOOPCOUNT; i++) { + #pragma omp atomic + diff -= i; + } + } + known_diff = ((LOOPCOUNT - 1) * LOOPCOUNT) / 2 * -1; + if (diff != 
known_diff) + { + fprintf (stderr, + "Error in difference with integers: Result was %d instead of 0.\n", + diff); + result++; + } + + // sum of doubles test + dsum = 0; + dpt = 1; + for (j = 0; j < DOUBLE_DIGITS; ++j) { + dpt *= dt; + } + dknown_sum = (1 - dpt) / (1 -dt); + #pragma omp parallel + { + int i; + #pragma omp for + for (i = 0; i < DOUBLE_DIGITS; ++i) { + #pragma omp atomic + dsum += pow (dt, i); + } + } + if (dsum != dknown_sum && (fabs (dsum - dknown_sum) > rounding_error)) { + fprintf (stderr, "Error in sum with doubles: Result was %f" + " instead of: %f (Difference: %E)\n", + dsum, dknown_sum, dsum - dknown_sum); + result++; + } + + // difference of doubles test + dpt = 1; + for (j = 0; j < DOUBLE_DIGITS; ++j) { + dpt *= dt; + } + ddiff = (1 - dpt) / (1 - dt); + #pragma omp parallel + { + int i; + #pragma omp for + for (i = 0; i < DOUBLE_DIGITS; ++i) { + #pragma omp atomic + ddiff -= pow (dt, i); + } + } + if (fabs (ddiff) > rounding_error) { + fprintf (stderr, + "Error in difference with doubles: Result was %E instead of 0.0\n", + ddiff); + result++; + } + + // product of integers test + #pragma omp parallel + { + int i; + #pragma omp for + for (i = 1; i <= MAX_FACTOR; i++) { + #pragma omp atomic + product *= i; + } + } + known_product = KNOWN_PRODUCT; + if (known_product != product) { + fprintf (stderr, + "Error in product with integers: Result was %d instead of %d\n", + product, known_product); + result++; + } + + // division of integers test + product = KNOWN_PRODUCT; + #pragma omp parallel + { + int i; + #pragma omp for + for (i = 1; i <= MAX_FACTOR; ++i) { + #pragma omp atomic + product /= i; + } + } + if (product != 1) { + fprintf (stderr, + "Error in product division with integers: Result was %d" + " instead of 1\n", + product); + result++; + } + + // division of doubles test + div = 5.0E+5; + #pragma omp parallel + { + int i; + #pragma omp for + for (i = 1; i <= MAX_FACTOR; i++) { + #pragma omp atomic + div /= i; + } + } + if 
(fabs(div-0.137787) >= 1.0E-4 ) { + result++; + fprintf (stderr, "Error in division with double: Result was %f" + " instead of 0.137787\n", div); + } + + // ++ test + x = 0; + #pragma omp parallel + { + int i; + #pragma omp for + for (i = 0; i < LOOPCOUNT; ++i) { + #pragma omp atomic + x++; + } + } + if (x != LOOPCOUNT) { + result++; + fprintf (stderr, "Error in ++\n"); + } + + // -- test + #pragma omp parallel + { + int i; + #pragma omp for + for (i = 0; i < LOOPCOUNT; ++i) { + #pragma omp atomic + x--; + } + } + if (x != 0) { + result++; + fprintf (stderr, "Error in --\n"); + } + + // bit-and test part 1 + for (j = 0; j < LOOPCOUNT; ++j) { + logics[j] = 1; + } + bit_and = 1; + #pragma omp parallel + { + int i; + #pragma omp for + for (i = 0; i < LOOPCOUNT; ++i) { + #pragma omp atomic + bit_and &= logics[i]; + } + } + if (!bit_and) { + result++; + fprintf (stderr, "Error in BIT AND part 1\n"); + } + + // bit-and test part 2 + bit_and = 1; + logics[LOOPCOUNT / 2] = 0; + #pragma omp parallel + { + int i; + #pragma omp for + for (i = 0; i < LOOPCOUNT; ++i) { + #pragma omp atomic + bit_and &= logics[i]; + } + } + if (bit_and) { + result++; + fprintf (stderr, "Error in BIT AND part 2\n"); + } + + // bit-or test part 1 + for (j = 0; j < LOOPCOUNT; j++) { + logics[j] = 0; + } + bit_or = 0; + #pragma omp parallel + { + int i; + #pragma omp for + for (i = 0; i < LOOPCOUNT; ++i) { + #pragma omp atomic + bit_or |= logics[i]; + } + } + if (bit_or) { + result++; + fprintf (stderr, "Error in BIT OR part 1\n"); + } + + // bit-or test part 2 + bit_or = 0; + logics[LOOPCOUNT / 2] = 1; + #pragma omp parallel + { + + int i; + #pragma omp for + for (i = 0; i < LOOPCOUNT; ++i) { + #pragma omp atomic + bit_or |= logics[i]; + } + } + if (!bit_or) { + result++; + fprintf (stderr, "Error in BIT OR part 2\n"); + } + + // bit-xor test part 1 + for (j = 0; j < LOOPCOUNT; j++) { + logics[j] = 0; + } + exclusiv_bit_or = 0; + #pragma omp parallel + { + int i; + #pragma omp for + for (i = 0; i < 
LOOPCOUNT; ++i) { + #pragma omp atomic + exclusiv_bit_or ^= logics[i]; + } + } + if (exclusiv_bit_or) { + result++; + fprintf (stderr, "Error in EXCLUSIV BIT OR part 1\n"); + } + + // bit-xor test part 2 + exclusiv_bit_or = 0; + logics[LOOPCOUNT / 2] = 1; + #pragma omp parallel + { + int i; + #pragma omp for + for (i = 0; i < LOOPCOUNT; ++i) { + #pragma omp atomic + exclusiv_bit_or ^= logics[i]; + } + + } + if (!exclusiv_bit_or) { + result++; + fprintf (stderr, "Error in EXCLUSIV BIT OR part 2\n"); + } + + // left shift test + x = 1; + #pragma omp parallel + { + int i; + #pragma omp for + for (i = 0; i < 10; ++i) { + #pragma omp atomic + x <<= 1; + } + + } + if ( x != 1024) { + result++; + fprintf (stderr, "Error in <<\n"); + x = 1024; + } + + // right shift test + #pragma omp parallel + { + int i; + #pragma omp for + for (i = 0; i < 10; ++i) { + #pragma omp atomic + x >>= 1; + } + } + if (x != 1) { + result++; + fprintf (stderr, "Error in >>\n"); + } + + return (result == 0); +} // test_omp_atomic() + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_atomic()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/barrier/omp_barrier.c b/final/runtime/test/barrier/omp_barrier.c new file mode 100644 index 0000000..a3fb060 --- /dev/null +++ b/final/runtime/test/barrier/omp_barrier.c @@ -0,0 +1,44 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" +#include "omp_my_sleep.h" + +int test_omp_barrier() +{ + int result1; + int result2; + result1 = 0; + result2 = 0; + + #pragma omp parallel + { + int rank; + rank = omp_get_thread_num (); + if (rank ==1) { + my_sleep(((double)SLEEPTIME)/REPETITIONS); // give 1 sec to whole test + result2 = 3; + } + #pragma omp barrier + if (rank == 2) { + result1 = result2; + } + } + return (result1 == 3); +} + +int main() +{ + int i; + int num_failed=0; + +#ifdef _OPENMP + omp_set_dynamic(0); // prevent runtime to change number of 
threads + omp_set_num_threads(4); // the test expects at least 3 threads + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_barrier()) { + num_failed++; + } + } +#endif + return num_failed; +} diff --git a/final/runtime/test/critical/omp_critical.c b/final/runtime/test/critical/omp_critical.c new file mode 100644 index 0000000..e07dbcb --- /dev/null +++ b/final/runtime/test/critical/omp_critical.c @@ -0,0 +1,37 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" + +int test_omp_critical() +{ + int sum; + int known_sum; + + sum=0; + #pragma omp parallel + { + int mysum=0; + int i; + #pragma omp for + for (i = 0; i < 1000; i++) + mysum = mysum + i; + + #pragma omp critical + sum = mysum +sum; + } + known_sum = 999 * 1000 / 2; + return (known_sum == sum); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_critical()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/env/kmp_aff_disable_hwloc.c b/final/runtime/test/env/kmp_aff_disable_hwloc.c new file mode 100644 index 0000000..5f848ac --- /dev/null +++ b/final/runtime/test/env/kmp_aff_disable_hwloc.c @@ -0,0 +1,21 @@ +// RUN: %libomp-compile && env KMP_AFFINITY=disabled KMP_TOPOLOGY_METHOD=hwloc %libomp-run +// REQUIRES: hwloc +#include <stdio.h> +#include <stdlib.h> + +// Test will assert() without fix +int test_affinity_disabled_plus_hwloc() { + #pragma omp parallel + {} + return 1; +} + +int main(int argc, char **argv) { + int i, j; + int failed = 0; + + if (!test_affinity_disabled_plus_hwloc()) { + failed = 1; + } + return failed; +} diff --git a/final/runtime/test/env/kmp_set_dispatch_buf.c b/final/runtime/test/env/kmp_set_dispatch_buf.c new file mode 100644 index 0000000..49eb7b5 --- /dev/null +++ b/final/runtime/test/env/kmp_set_dispatch_buf.c @@ -0,0 +1,76 @@ +// RUN: %libomp-compile && env KMP_DISP_NUM_BUFFERS=0 %libomp-run +// RUN: env KMP_DISP_NUM_BUFFERS=1 %libomp-run && env 
KMP_DISP_NUM_BUFFERS=3 %libomp-run +// RUN: env KMP_DISP_NUM_BUFFERS=4 %libomp-run && env KMP_DISP_NUM_BUFFERS=7 %libomp-run +// RUN: %libomp-compile -DMY_SCHEDULE=guided && env KMP_DISP_NUM_BUFFERS=1 %libomp-run +// RUN: env KMP_DISP_NUM_BUFFERS=3 %libomp-run && env KMP_DISP_NUM_BUFFERS=4 %libomp-run +// RUN: env KMP_DISP_NUM_BUFFERS=7 %libomp-run +#include <stdio.h> +#include <omp.h> +#include <stdlib.h> +#include <limits.h> +#include "omp_testsuite.h" + +#define INCR 7 +#define MY_MAX 200 +#define MY_MIN -200 +#define NUM_LOOPS 100 +#ifndef MY_SCHEDULE +# define MY_SCHEDULE dynamic +#endif + +int a, b, a_known_value, b_known_value; + +int test_kmp_set_disp_num_buffers() +{ + int success = 1; + a = 0; + b = 0; + // run many small dynamic loops to stress the dispatch buffer system + #pragma omp parallel + { + int i,j; + for (j = 0; j < NUM_LOOPS; j++) { + #pragma omp for schedule(MY_SCHEDULE) nowait + for (i = MY_MIN; i < MY_MAX; i+=INCR) { + #pragma omp atomic + a++; + } + #pragma omp for schedule(MY_SCHEDULE) nowait + for (i = MY_MAX; i >= MY_MIN; i-=INCR) { + #pragma omp atomic + b++; + } + } + } + // detect failure + if (a != a_known_value || b != b_known_value) { + success = 0; + printf("a = %d (should be %d), b = %d (should be %d)\n", a, a_known_value, + b, b_known_value); + } + return success; +} + +int main(int argc, char** argv) +{ + int i,j; + int num_failed=0; + + // figure out the known values to compare with calculated result + a_known_value = 0; + b_known_value = 0; + + for (j = 0; j < NUM_LOOPS; j++) { + for (i = MY_MIN; i < MY_MAX; i+=INCR) + a_known_value++; + for (i = MY_MAX; i >= MY_MIN; i-=INCR) + b_known_value++; + } + + for(i = 0; i < REPETITIONS; i++) { + if(!test_kmp_set_disp_num_buffers()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/env/omp_thread_limit.c b/final/runtime/test/env/omp_thread_limit.c new file mode 100644 index 0000000..800edc4 --- /dev/null +++ b/final/runtime/test/env/omp_thread_limit.c 
@@ -0,0 +1,82 @@ +// RUN: %libomp-compile && env OMP_THREAD_LIMIT=4 %libomp-run 4 +// RUN: %libomp-compile && env OMP_THREAD_LIMIT=7 %libomp-run 7 +// +// OMP_THREAD_LIMIT=N should imply that no more than N threads are active in +// a contention group +#include <stdio.h> +#include <string.h> +#include <limits.h> +#include "omp_testsuite.h" + +int failed = 0; + +void usage() { + fprintf(stderr, "usage: omp_thread_limit <n>\n"); +} + +void verify(const char* file_name, int line_number, int team_size) { + int num_threads = omp_get_num_threads(); + if (team_size != num_threads) { +#pragma omp critical(A) + { + char label[256]; + snprintf(label, sizeof(label), "%s:%d", file_name, line_number); + failed = 1; + printf("failed: %s: team_size(%d) != omp_get_num_threads(%d)\n", + label, team_size, num_threads); + } + } +} + +int main(int argc, char** argv) +{ + int cl_thread_limit; + + if (argc != 2) { + usage(); + return 1; + } + cl_thread_limit = atoi(argv[1]); + + omp_set_dynamic(0); + if (omp_get_thread_limit() != cl_thread_limit) { + fprintf(stderr, "omp_get_thread_limit failed with %d, should be%d\n", + omp_get_thread_limit(), cl_thread_limit); + return 1; + } + else if (omp_get_max_threads() > cl_thread_limit) { +#if _OPENMP + int team_size = cl_thread_limit; +#else + int team_size = 1; +#endif + omp_set_num_threads(19); + verify(__FILE__, __LINE__, 1); +#pragma omp parallel + { + verify(__FILE__, __LINE__, team_size); + verify(__FILE__, __LINE__, team_size); + } + verify(__FILE__, __LINE__, 1); + + omp_set_nested(1); +#pragma omp parallel num_threads(3) + { + verify(__FILE__, __LINE__, 3); +#pragma omp master +#pragma omp parallel num_threads(21) + { + verify(__FILE__, __LINE__, team_size-2); + verify(__FILE__, __LINE__, team_size-2); + } + } + verify(__FILE__, __LINE__, 1); + + return failed; + } else { + fprintf(stderr, "This test is not applicable for max num_threads='%d'\n", + omp_get_max_threads()); + return 0; + } + +} diff --git 
a/final/runtime/test/env/omp_wait_policy.c b/final/runtime/test/env/omp_wait_policy.c new file mode 100644 index 0000000..b260ce4 --- /dev/null +++ b/final/runtime/test/env/omp_wait_policy.c @@ -0,0 +1,40 @@ +// RUN: %libomp-compile && env OMP_WAIT_POLICY=active %libomp-run active +// RUN: %libomp-compile && env OMP_WAIT_POLICY=passive %libomp-run passive +// +// OMP_WAIT_POLICY=active should imply blocktime == INT_MAX +// i.e., threads spin-wait forever +// OMP_WAIT_POLICY=passive should imply blocktime == 0 +// i.e., threads immediately sleep +#include <stdio.h> +#include <string.h> +#include <limits.h> +#include "omp_testsuite.h" + +void usage() { + fprintf(stderr, "usage: omp_wait_policy active|passive\n"); +} + +int main(int argc, char** argv) +{ + int blocktime, retval=1; + const char* env_var_value; + + if (argc != 2) { + usage(); + return 1; + } + + blocktime = kmp_get_blocktime(); + + env_var_value = argv[1]; + if (!strcmp(env_var_value, "active")) { + retval = (blocktime != INT_MAX); + } else if (!strcmp(env_var_value, "passive")) { + retval = (blocktime != 0); + } else { + usage(); + retval = 1; + } + + return retval; +} diff --git a/final/runtime/test/flush/omp_flush.c b/final/runtime/test/flush/omp_flush.c new file mode 100644 index 0000000..3fd3cdf --- /dev/null +++ b/final/runtime/test/flush/omp_flush.c @@ -0,0 +1,45 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" +#include "omp_my_sleep.h" + +int test_omp_flush() +{ + int result1; + int result2; + int dummy; + + result1 = 0; + result2 = 0; + + #pragma omp parallel + { + int rank; + rank = omp_get_thread_num (); + #pragma omp barrier + if (rank == 1) { + result2 = 3; + #pragma omp flush (result2) + dummy = result2; + } + if (rank == 0) { + my_sleep(SLEEPTIME); + #pragma omp flush (result2) + result1 = result2; + } + } /* end of parallel */ + return ((result1 == result2) && (result2 == dummy) && (result2 == 3)); +} + +int main() +{ + int i; + int num_failed=0; + + 
for (i = 0; i < REPETITIONS; i++) { + if(!test_omp_flush()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/lit.cfg b/final/runtime/test/lit.cfg new file mode 100644 index 0000000..e4561eb --- /dev/null +++ b/final/runtime/test/lit.cfg @@ -0,0 +1,130 @@ +# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79: +# Configuration file for the 'lit' test runner. + +import os +import re +import subprocess +import lit.formats + +# Tell pylint that we know config and lit_config exist somewhere. +if 'PYLINT_IMPORT' in os.environ: + config = object() + lit_config = object() + +def append_dynamic_library_path(path): + if config.operating_system == 'Windows': + name = 'PATH' + sep = ';' + elif config.operating_system == 'Darwin': + name = 'DYLD_LIBRARY_PATH' + sep = ':' + else: + name = 'LD_LIBRARY_PATH' + sep = ':' + if name in config.environment: + config.environment[name] = path + sep + config.environment[name] + else: + config.environment[name] = path + +# name: The name of this test suite. +config.name = 'libomp' + +# suffixes: A list of file extensions to treat as test files. +config.suffixes = ['.c', '.cpp'] + +# test_source_root: The root path where tests are located. 
+config.test_source_root = os.path.dirname(__file__) + +# test_exec_root: The root object directory where output is placed +config.test_exec_root = config.libomp_obj_root + +# test format +config.test_format = lit.formats.ShTest() + +# compiler flags +config.test_flags = " -I " + config.test_source_root + \ + " -I " + config.omp_header_directory + \ + " -L " + config.library_dir + \ + " " + config.test_extra_flags + +# extra libraries +libs = "" +if config.has_libm: + libs += " -lm" +if config.has_libatomic: + libs += " -latomic" + +# Allow XFAIL to work +config.target_triple = [ ] +for feature in config.test_compiler_features: + config.available_features.add(feature) + +# Setup environment to find dynamic library at runtime +append_dynamic_library_path(config.library_dir) +if config.using_hwloc: + append_dynamic_library_path(config.hwloc_library_dir) + config.available_features.add('hwloc') + +# Rpath modifications for Darwin +if config.operating_system == 'Darwin': + config.test_flags += " -Wl,-rpath," + config.library_dir + if config.using_hwloc: + config.test_flags += " -Wl,-rpath," + config.hwloc_library_dir + +# Find the SDK on Darwin +if config.operating_system == 'Darwin': + cmd = subprocess.Popen(['xcrun', '--show-sdk-path'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = cmd.communicate() + out = out.strip() + res = cmd.wait() + if res == 0 and out: + config.test_flags += " -isysroot " + out + +# Disable OMPT tests if FileCheck was not found +if config.has_ompt and config.test_filecheck == "": + lit_config.note("Not testing OMPT because FileCheck was not found") + config.has_ompt = False + +if config.has_ompt: + config.available_features.add("ompt") + # for callback.h + config.test_flags += " -I " + config.test_source_root + "/ompt" + +if 'Linux' in config.operating_system: + config.available_features.add("linux") + +# to run with icc INTEL_LICENSE_FILE must be set +if 'INTEL_LICENSE_FILE' in os.environ: + 
config.environment['INTEL_LICENSE_FILE'] = os.environ['INTEL_LICENSE_FILE'] + + +# substitutions +config.substitutions.append(("%libomp-compile-and-run", \ + "%libomp-compile && %libomp-run")) +config.substitutions.append(("%libomp-cxx-compile-and-run", \ + "%libomp-cxx-compile && %libomp-run")) +config.substitutions.append(("%libomp-cxx-compile", \ + "%clangXX %openmp_flags %flags -std=c++11 %s -o %t" + libs)) +config.substitutions.append(("%libomp-compile", \ + "%clang %openmp_flags %flags %s -o %t" + libs)) +config.substitutions.append(("%libomp-run", "%t")) +config.substitutions.append(("%clangXX", config.test_cxx_compiler)) +config.substitutions.append(("%clang", config.test_c_compiler)) +config.substitutions.append(("%openmp_flags", config.test_openmp_flags)) +config.substitutions.append(("%flags", config.test_flags)) + +if config.has_ompt: + config.substitutions.append(("FileCheck", config.test_filecheck)) + config.substitutions.append(("%sort-threads", "sort --numeric-sort --stable")) + if config.operating_system == 'Windows': + # No such environment variable on Windows. + config.substitutions.append(("%preload-tool", "true ||")) + config.substitutions.append(("%no-as-needed-flag", "-Wl,--no-as-needed")) + elif config.operating_system == 'Darwin': + config.substitutions.append(("%preload-tool", "env DYLD_INSERT_LIBRARIES=%T/tool.so")) + # No such linker flag on Darwin. 
+ config.substitutions.append(("%no-as-needed-flag", "")) + else: + config.substitutions.append(("%preload-tool", "env LD_PRELOAD=%T/tool.so")) + config.substitutions.append(("%no-as-needed-flag", "-Wl,--no-as-needed")) diff --git a/final/runtime/test/lit.site.cfg.in b/final/runtime/test/lit.site.cfg.in new file mode 100644 index 0000000..c2825ee --- /dev/null +++ b/final/runtime/test/lit.site.cfg.in @@ -0,0 +1,20 @@ +@AUTO_GEN_COMMENT@ + +config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@" +config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@" +config.test_compiler_features = @OPENMP_TEST_COMPILER_FEATURES@ +config.test_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@" +config.test_openmp_flags = "@OPENMP_TEST_OPENMP_FLAGS@" +config.test_extra_flags = "@OPENMP_TEST_FLAGS@" +config.libomp_obj_root = "@CMAKE_CURRENT_BINARY_DIR@" +config.library_dir = "@LIBOMP_LIBRARY_DIR@" +config.omp_header_directory = "@LIBOMP_BINARY_DIR@/src" +config.operating_system = "@CMAKE_SYSTEM_NAME@" +config.hwloc_library_dir = "@LIBOMP_HWLOC_LIBRARY_DIR@" +config.using_hwloc = @LIBOMP_USE_HWLOC@ +config.has_ompt = @LIBOMP_OMPT_SUPPORT@ and @LIBOMP_OMPT_OPTIONAL@ +config.has_libm = @LIBOMP_HAVE_LIBM@ +config.has_libatomic = @LIBOMP_HAVE_LIBATOMIC@ + +# Let the main config do the real work. +lit_config.load_config(config, "@LIBOMP_BASE_DIR@/test/lit.cfg") diff --git a/final/runtime/test/lock/omp_init_lock.c b/final/runtime/test/lock/omp_init_lock.c new file mode 100644 index 0000000..24b60d1 --- /dev/null +++ b/final/runtime/test/lock/omp_init_lock.c @@ -0,0 +1,42 @@ +// RUN: %libomp-compile-and-run +#include "omp_testsuite.h" +#include <stdio.h> + +// This should be slightly less than KMP_I_LOCK_CHUNK, which is 1024 +#define LOCKS_PER_ITER 1000 +#define ITERATIONS (REPETITIONS + 1) + +// This tests concurrently using locks on one thread while initializing new +// ones on another thread. This exercises the global lock pool. 
+int test_omp_init_lock() { + int i; + omp_lock_t lcks[ITERATIONS * LOCKS_PER_ITER]; +#pragma omp parallel for schedule(static) num_threads(NUM_TASKS) + for (i = 0; i < ITERATIONS; i++) { + int j; + omp_lock_t *my_lcks = &lcks[i * LOCKS_PER_ITER]; + for (j = 0; j < LOCKS_PER_ITER; j++) { + omp_init_lock(&my_lcks[j]); + } + for (j = 0; j < LOCKS_PER_ITER * 100; j++) { + omp_set_lock(&my_lcks[j % LOCKS_PER_ITER]); + omp_unset_lock(&my_lcks[j % LOCKS_PER_ITER]); + } + } + // Wait until all repititions are done. The test is exercising growth of + // the global lock pool, which does not shrink when no locks are allocated. + { + int j; + for (j = 0; j < ITERATIONS * LOCKS_PER_ITER; j++) { + omp_destroy_lock(&lcks[j]); + } + } + + return 0; +} + +int main() { + // No use repeating this test, since it's exercising a private global pool + // which is not reset between test iterations. + return test_omp_init_lock(); +} diff --git a/final/runtime/test/lock/omp_lock.c b/final/runtime/test/lock/omp_lock.c new file mode 100644 index 0000000..1301f27 --- /dev/null +++ b/final/runtime/test/lock/omp_lock.c @@ -0,0 +1,47 @@ +// RUN: %libomp-compile-and-run +// RUN: env KMP_LOCK_KIND=tas KMP_SPIN_BACKOFF_PARAMS=2048,200 %libomp-run +// RUN: env KMP_LOCK_KIND=futex %libomp-run +#include <stdio.h> +#include "omp_testsuite.h" + +omp_lock_t lck; + +int test_omp_lock() +{ + int nr_threads_in_single = 0; + int result = 0; + int nr_iterations = 0; + int i; + + omp_init_lock(&lck); + #pragma omp parallel shared(lck) + { + #pragma omp for + for(i = 0; i < LOOPCOUNT; i++) { + omp_set_lock(&lck); + #pragma omp flush + nr_threads_in_single++; + #pragma omp flush + nr_iterations++; + nr_threads_in_single--; + result = result + nr_threads_in_single; + omp_unset_lock(&lck); + } + } + omp_destroy_lock(&lck); + + return ((result == 0) && (nr_iterations == LOOPCOUNT)); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_lock()) { + num_failed++; + } + 
} + return num_failed; +} diff --git a/final/runtime/test/lock/omp_nest_lock.c b/final/runtime/test/lock/omp_nest_lock.c new file mode 100644 index 0000000..33d7c6a --- /dev/null +++ b/final/runtime/test/lock/omp_nest_lock.c @@ -0,0 +1,45 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" + +omp_nest_lock_t lck; + +int test_omp_nest_lock() +{ + int nr_threads_in_single = 0; + int result = 0; + int nr_iterations = 0; + int i; + + omp_init_nest_lock(&lck); + #pragma omp parallel shared(lck) + { + #pragma omp for + for(i = 0; i < LOOPCOUNT; i++) { + omp_set_nest_lock(&lck); + #pragma omp flush + nr_threads_in_single++; + #pragma omp flush + nr_iterations++; + nr_threads_in_single--; + result = result + nr_threads_in_single; + omp_unset_nest_lock(&lck); + } + } + omp_destroy_nest_lock(&lck); + + return ((result == 0) && (nr_iterations == LOOPCOUNT)); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_nest_lock()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/lock/omp_test_lock.c b/final/runtime/test/lock/omp_test_lock.c new file mode 100644 index 0000000..c512055 --- /dev/null +++ b/final/runtime/test/lock/omp_test_lock.c @@ -0,0 +1,47 @@ +// RUN: %libomp-compile-and-run +// RUN: env KMP_LOCK_KIND=tas %libomp-run +// RUN: env KMP_LOCK_KIND=futex %libomp-run +#include <stdio.h> +#include "omp_testsuite.h" + +omp_lock_t lck; + +int test_omp_test_lock() +{ + int nr_threads_in_single = 0; + int result = 0; + int nr_iterations = 0; + int i; + + omp_init_lock (&lck); + #pragma omp parallel shared(lck) + { + #pragma omp for + for (i = 0; i < LOOPCOUNT; i++) { + while (!omp_test_lock (&lck)) + {}; + #pragma omp flush + nr_threads_in_single++; + #pragma omp flush + nr_iterations++; + nr_threads_in_single--; + result = result + nr_threads_in_single; + omp_unset_lock (&lck); + } + } + omp_destroy_lock(&lck); + return ((result == 0) && (nr_iterations == 
LOOPCOUNT)); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_test_lock()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/lock/omp_test_nest_lock.c b/final/runtime/test/lock/omp_test_nest_lock.c new file mode 100644 index 0000000..2fa6fd2 --- /dev/null +++ b/final/runtime/test/lock/omp_test_nest_lock.c @@ -0,0 +1,47 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" + +static omp_nest_lock_t lck; + +int test_omp_test_nest_lock() +{ + int nr_threads_in_single = 0; + int result = 0; + int nr_iterations = 0; + int i; + + omp_init_nest_lock (&lck); + #pragma omp parallel shared(lck) + { + #pragma omp for + for (i = 0; i < LOOPCOUNT; i++) + { + /*omp_set_lock(&lck);*/ + while(!omp_test_nest_lock (&lck)) + {}; + #pragma omp flush + nr_threads_in_single++; + #pragma omp flush + nr_iterations++; + nr_threads_in_single--; + result = result + nr_threads_in_single; + omp_unset_nest_lock (&lck); + } + } + omp_destroy_nest_lock (&lck); + return ((result == 0) && (nr_iterations == LOOPCOUNT)); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_test_nest_lock()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/master/omp_master.c b/final/runtime/test/master/omp_master.c new file mode 100644 index 0000000..1cc7f9e --- /dev/null +++ b/final/runtime/test/master/omp_master.c @@ -0,0 +1,38 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" + +int test_omp_master() +{ + int nthreads; + int executing_thread; + + nthreads = 0; + executing_thread = -1; + + #pragma omp parallel + { + #pragma omp master + { + #pragma omp critical + { + nthreads++; + } + executing_thread = omp_get_thread_num(); + } /* end of master*/ + } /* end of parallel*/ + return ((nthreads == 1) && (executing_thread == 0)); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 
0; i < REPETITIONS; i++) { + if(!test_omp_master()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/master/omp_master_3.c b/final/runtime/test/master/omp_master_3.c new file mode 100644 index 0000000..2e9fdf8 --- /dev/null +++ b/final/runtime/test/master/omp_master_3.c @@ -0,0 +1,44 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" + +int test_omp_master_3() +{ + int nthreads; + int executing_thread; + int tid_result = 0; /* counts up the number of wrong thread no. for + the master thread. (Must be 0) */ + nthreads = 0; + executing_thread = -1; + + #pragma omp parallel + { + #pragma omp master + { + int tid = omp_get_thread_num(); + if (tid != 0) { + #pragma omp critical + { tid_result++; } + } + #pragma omp critical + { + nthreads++; + } + executing_thread = omp_get_thread_num (); + } /* end of master*/ + } /* end of parallel*/ + return ((nthreads == 1) && (executing_thread == 0) && (tid_result == 0)); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_master_3()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/misc_bugs/cancellation_for_sections.c b/final/runtime/test/misc_bugs/cancellation_for_sections.c new file mode 100644 index 0000000..07a61cb --- /dev/null +++ b/final/runtime/test/misc_bugs/cancellation_for_sections.c @@ -0,0 +1,64 @@ +// RUN: %libomp-compile && env OMP_CANCELLATION=true %libomp-run +// XFAIL: gcc +// Clang had a bug until version 4.0.1 which resulted in a hang. +// UNSUPPORTED: clang-3, clang-4.0.0 + +// Regression test for a bug in cancellation to cover effect of `#pragma omp cancel` +// in a loop construct, on sections construct. +// Pass condition: Cancellation status from `for` does not persist +// to `sections`. 
+ +#include <stdio.h> +#include <omp.h> + +int result[2] = {0, 0}; + +void cq416850_for_sections() { + + unsigned i; + // 1) loop + #pragma omp for + for (i = 0; i < 1; i++) { + result[0] = 1; + #pragma omp cancel for + result[0] = 2; + } + +// printf("thread %d: result[0] = %d, result[1] = %d \n", omp_get_thread_num(), result[0], result[1]); + + + // 2) sections + #pragma omp sections + { + #pragma omp section + { + result[1] = 1; + #pragma omp cancellation point sections + result[1] = 2; + } + } +} + +int main(void) { + if(!omp_get_cancellation()) { + printf("Cancellation not enabled!\n"); + return 2; + } + + #pragma omp parallel num_threads(4) + { + cq416850_for_sections(); + } + + if (result[0] != 1 || result[1] != 2) { + printf("Incorrect values. " + "result[0] = %d (expected 1), " + "result[1] = %d (expected 2).\n", + result[0], result[1]); + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} diff --git a/final/runtime/test/misc_bugs/many-microtask-args.c b/final/runtime/test/misc_bugs/many-microtask-args.c new file mode 100644 index 0000000..d644515 --- /dev/null +++ b/final/runtime/test/misc_bugs/many-microtask-args.c @@ -0,0 +1,39 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> + +int main() +{ + + int i; + int i1 = 0; + int i2 = 1; + int i3 = 2; + int i4 = 3; + int i5 = 4; + int i6 = 6; + int i7 = 7; + int i8 = 8; + int i9 = 9; + int i10 = 10; + int i11 = 11; + int i12 = 12; + int i13 = 13; + int i14 = 14; + int i15 = 15; + int i16 = 16; + + int r = 0; + #pragma omp parallel for firstprivate(i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, i16) reduction(+:r) + for (i = 0; i < i16; i++) { + r += i + i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9 + i10 + i11 + i12 + i13 + i14 + i15 + i16; + } + + int rf = 2216; + if (r != rf) { + fprintf(stderr, "r should be %d but instead equals %d\n", rf, r); + return 1; + } + + return 0; +} + diff --git a/final/runtime/test/misc_bugs/omp_foreign_thread_team_reuse.c 
b/final/runtime/test/misc_bugs/omp_foreign_thread_team_reuse.c new file mode 100644 index 0000000..4d70d47 --- /dev/null +++ b/final/runtime/test/misc_bugs/omp_foreign_thread_team_reuse.c @@ -0,0 +1,81 @@ +// RUN: %libomp-compile -lpthread && %libomp-run +#include <stdio.h> +#include "omp_testsuite.h" + +#define NUM_THREADS 10 + +/* + After hot teams were enabled by default, the library started using levels + kept in the team structure. The levels are broken in case foreign thread + exits and puts its team into the pool which is then re-used by another foreign + thread. The broken behavior observed is when printing the levels for each + new team, one gets 1, 2, 1, 2, 1, 2, etc. This makes the library believe that + every other team is nested which is incorrect. What is wanted is for the + levels to be 1, 1, 1, etc. +*/ + +int a = 0; +int level; + +typedef struct thread_arg_t { + int iterations; +} thread_arg_t; + +void* thread_function(void* arg) { + int i; + thread_arg_t* targ = (thread_arg_t*)arg; + int iterations = targ->iterations; + #pragma omp parallel private(i) + { + // level should always be 1 + #pragma omp single + level = omp_get_level(); + + #pragma omp for + for(i = 0; i < iterations; i++) { + #pragma omp atomic + a++; + } + } +} + +int test_omp_team_reuse() +{ + int i; + int success = 1; + pthread_t thread[NUM_THREADS]; + thread_arg_t thread_arg[NUM_THREADS]; + // launch NUM_THREADS threads, one at a time to perform thread_function() + for(i = 0; i < NUM_THREADS; i++) { + thread_arg[i].iterations = i + 1; + pthread_create(thread+i, NULL, thread_function, thread_arg+i); + pthread_join(*(thread+i), NULL); + // level read in thread_function()'s parallel region should be 1 + if(level != 1) { + fprintf(stderr, "error: for pthread %d level should be 1 but " + "instead equals %d\n", i, level); + success = 0; + } + } + // make sure the for loop works + int known_sum = (NUM_THREADS * (NUM_THREADS+1)) / 2; + if(a != known_sum) { + fprintf(stderr, "a should be 
%d but instead equals %d\n", known_sum, a); + success = 0; + } + return success; +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + a = 0; + if(!test_omp_team_reuse()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/misc_bugs/teams-no-par.c b/final/runtime/test/misc_bugs/teams-no-par.c new file mode 100644 index 0000000..0ef8d9a --- /dev/null +++ b/final/runtime/test/misc_bugs/teams-no-par.c @@ -0,0 +1,64 @@ +// RUN: %libomp-compile-and-run +// +// The test checks the teams construct pseudocode executed on host +// + +#include <stdio.h> +#include <omp.h> + +#ifndef N_TEAMS +#define N_TEAMS 4 +#endif +#ifndef N_THR +#define N_THR 3 +#endif + +static int err = 0; + +// Internal library staff to emulate compiler's code generation: +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + int reserved_1; + int flags; + int reserved_2; + int reserved_3; + char *psource; +} ident_t; + +static ident_t dummy_loc = {0, 2, 0, 0, ";dummyFile;dummyFunc;0;0;;"}; + +int __kmpc_global_thread_num(void*); +void __kmpc_push_num_teams(ident_t const*, int, int, int); +void __kmpc_fork_teams(ident_t const*, int argc, void *microtask, ...); + +#ifdef __cplusplus +} +#endif + +// Outlined entry point: +void foo(int *gtid, int *tid, int *nt) +{ // start "serial" execution by master threads of each team + if ( nt ) { + printf(" team %d, param %d\n", omp_get_team_num(), *nt); + } else { + printf("ERROR: teams before parallel: gtid, tid: %d %d, bad pointer: %p\n", *gtid, *tid, nt); + err++; + return; + } +} + +int main() +{ + int nt = 4; + int th = __kmpc_global_thread_num(NULL); // registers initial thread + __kmpc_push_num_teams(&dummy_loc, th, N_TEAMS, N_THR); + __kmpc_fork_teams(&dummy_loc, 1, &foo, &nt); // pass 1 shared parameter "nt" + if (err) + printf("failed with %d errors\n",err); + else + printf("passed\n"); + return err; +} diff --git a/final/runtime/test/misc_bugs/teams-reduction.c 
b/final/runtime/test/misc_bugs/teams-reduction.c new file mode 100644 index 0000000..6d7cd11 --- /dev/null +++ b/final/runtime/test/misc_bugs/teams-reduction.c @@ -0,0 +1,68 @@ +// RUN: %libomp-compile-and-run +// +// The test checks the teams construct with reduction executed on the host. +// + +#include <stdio.h> +#include <omp.h> + +#include <stdint.h> + +#ifndef N_TEAMS +#define N_TEAMS 4 +#endif +#ifndef N_THR +#define N_THR 3 +#endif + +// Internal library stuff to emulate compiler's code generation: +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + int32_t reserved_1; + int32_t flags; + int32_t reserved_2; + int32_t reserved_3; + char const *psource; +} ident_t; + +static ident_t dummy_loc = {0, 2, 0, 0, ";dummyFile;dummyFunc;0;0;;"}; + +typedef union { + // The global will be used as pointer, so we need to make sure that the + // compiler correctly aligns the global... + void *ptr; + int32_t data[8]; +} kmp_critical_name; +kmp_critical_name crit; + +int32_t __kmpc_global_thread_num(ident_t *); +void __kmpc_push_num_teams(ident_t *, int32_t global_tid, int32_t num_teams, + int32_t num_threads); +void __kmpc_fork_teams(ident_t *, int32_t argc, void *microtask, ...); +int32_t __kmpc_reduce(ident_t *, int32_t global_tid, int32_t num_vars, + size_t reduce_size, void *reduce_data, void *reduce_func, + kmp_critical_name *lck); +void __kmpc_end_reduce(ident_t *, int32_t global_tid, kmp_critical_name *lck); + +#ifdef __cplusplus +} +#endif + +// Outlined entry point: +void outlined(int32_t *gtid, int32_t *tid) { + int32_t ret = __kmpc_reduce(&dummy_loc, *gtid, 0, 0, NULL, NULL, &crit); + __kmpc_end_reduce(&dummy_loc, *gtid, &crit); +} + +int main() { + int32_t th = __kmpc_global_thread_num(NULL); // registers initial thread + __kmpc_push_num_teams(&dummy_loc, th, N_TEAMS, N_THR); + __kmpc_fork_teams(&dummy_loc, 0, &outlined); + + // Test did not hang -> passed! 
+ printf("passed\n"); + return 0; +} diff --git a/final/runtime/test/omp_my_sleep.h b/final/runtime/test/omp_my_sleep.h new file mode 100644 index 0000000..138d930 --- /dev/null +++ b/final/runtime/test/omp_my_sleep.h @@ -0,0 +1,33 @@ +#ifndef MY_SLEEP_H +#define MY_SLEEP_H + +/*! Utility function to have a sleep function with better resolution and + * which only stops one thread. */ + +#include <stdio.h> +#include <stdlib.h> +#include <errno.h> +#include <time.h> + +#if defined(_WIN32) +# include <windows.h> +// Windows version of my_sleep() function +static void my_sleep(double sleeptime) { + DWORD ms = (DWORD) (sleeptime * 1000.0); + Sleep(ms); +} + + +#else // _WIN32 + +// Unices version of my_sleep() function +static void my_sleep(double sleeptime) { + struct timespec ts; + ts.tv_sec = (time_t)sleeptime; + ts.tv_nsec = (long)((sleeptime - (double)ts.tv_sec) * 1E9); + nanosleep(&ts, NULL); +} + +#endif // _WIN32 + +#endif // MY_SLEEP_H diff --git a/final/runtime/test/omp_testsuite.h b/final/runtime/test/omp_testsuite.h new file mode 100644 index 0000000..eef5470 --- /dev/null +++ b/final/runtime/test/omp_testsuite.h @@ -0,0 +1,79 @@ +/* Global headerfile of the OpenMP Testsuite */ + +#ifndef OMP_TESTSUITE_H +#define OMP_TESTSUITE_H + +#include <stdio.h> +#include <stdlib.h> +#include <omp.h> + +/* General */ +/**********************************************************/ +#define LOOPCOUNT 1000 /* Number of iterations to slit amongst threads */ +#define REPETITIONS 10 /* Number of times to run each test */ + +/* following times are in seconds */ +#define SLEEPTIME 1 + +/* Definitions for tasks */ +/**********************************************************/ +#define NUM_TASKS 25 +#define MAX_TASKS_PER_THREAD 5 + +#ifdef _WIN32 +// Windows versions of pthread_create() and pthread_join() +# include <windows.h> +typedef HANDLE pthread_t; + +// encapsulates the information about a pthread-callable function +struct thread_func_info_t { + void* (*start_routine)(void*); 
+ void* arg; +}; + +// call the void* start_routine(void*); +static DWORD __thread_func_wrapper(LPVOID lpParameter) { + struct thread_func_info_t* function_information; + function_information = (struct thread_func_info_t*)lpParameter; + function_information->start_routine(function_information->arg); + free(function_information); + return 0; +} + +// attr is ignored +static int pthread_create(pthread_t *thread, void *attr, + void *(*start_routine) (void *), void *arg) { + pthread_t pthread; + struct thread_func_info_t* info; + info = (struct thread_func_info_t*)malloc(sizeof(struct thread_func_info_t)); + info->start_routine = start_routine; + info->arg = arg; + pthread = CreateThread(NULL, 0, __thread_func_wrapper, info, 0, NULL); + if (pthread == NULL) { + fprintf(stderr, "CreateThread() failed: Error #%u.\n", GetLastError()); + exit(1); + } + *thread = pthread; + return 0; +} +// retval is ignored for now +static int pthread_join(pthread_t thread, void **retval) { + int rc; + rc = WaitForSingleObject(thread, INFINITE); + if (rc == WAIT_FAILED) { + fprintf(stderr, "WaitForSingleObject() failed: Error #%u.\n", + GetLastError()); + exit(1); + } + rc = CloseHandle(thread); + if (rc == 0) { + fprintf(stderr, "CloseHandle() failed: Error #%u.\n", GetLastError()); + exit(1); + } + return 0; +} +#else +# include <pthread.h> +#endif + +#endif diff --git a/final/runtime/test/ompt/callback.h b/final/runtime/test/ompt/callback.h new file mode 100755 index 0000000..f1191ad --- /dev/null +++ b/final/runtime/test/ompt/callback.h @@ -0,0 +1,764 @@ +#ifndef _BSD_SOURCE +#define _BSD_SOURCE +#endif +#define _DEFAULT_SOURCE +#include <stdio.h> +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif +#include <inttypes.h> +#include <omp.h> +#include <ompt.h> +#include "ompt-signal.h" + +// Used to detect architecture +#include "../../src/kmp_platform.h" + +static const char* ompt_thread_type_t_values[] = { + NULL, + "ompt_thread_initial", + "ompt_thread_worker", + 
"ompt_thread_other" +}; + +static const char* ompt_task_status_t_values[] = { + NULL, + "ompt_task_complete", + "ompt_task_yield", + "ompt_task_cancel", + "ompt_task_others" +}; +static const char* ompt_cancel_flag_t_values[] = { + "ompt_cancel_parallel", + "ompt_cancel_sections", + "ompt_cancel_do", + "ompt_cancel_taskgroup", + "ompt_cancel_activated", + "ompt_cancel_detected", + "ompt_cancel_discarded_task" +}; + +static void format_task_type(int type, char *buffer) { + char *progress = buffer; + if (type & ompt_task_initial) + progress += sprintf(progress, "ompt_task_initial"); + if (type & ompt_task_implicit) + progress += sprintf(progress, "ompt_task_implicit"); + if (type & ompt_task_explicit) + progress += sprintf(progress, "ompt_task_explicit"); + if (type & ompt_task_target) + progress += sprintf(progress, "ompt_task_target"); + if (type & ompt_task_undeferred) + progress += sprintf(progress, "|ompt_task_undeferred"); + if (type & ompt_task_untied) + progress += sprintf(progress, "|ompt_task_untied"); + if (type & ompt_task_final) + progress += sprintf(progress, "|ompt_task_final"); + if (type & ompt_task_mergeable) + progress += sprintf(progress, "|ompt_task_mergeable"); + if (type & ompt_task_merged) + progress += sprintf(progress, "|ompt_task_merged"); +} + +static ompt_set_callback_t ompt_set_callback; +static ompt_get_callback_t ompt_get_callback; +static ompt_get_state_t ompt_get_state; +static ompt_get_task_info_t ompt_get_task_info; +static ompt_get_thread_data_t ompt_get_thread_data; +static ompt_get_parallel_info_t ompt_get_parallel_info; +static ompt_get_unique_id_t ompt_get_unique_id; +static ompt_get_num_procs_t ompt_get_num_procs; +static ompt_get_num_places_t ompt_get_num_places; +static ompt_get_place_proc_ids_t ompt_get_place_proc_ids; +static ompt_get_place_num_t ompt_get_place_num; +static ompt_get_partition_place_nums_t ompt_get_partition_place_nums; +static ompt_get_proc_id_t ompt_get_proc_id; +static ompt_enumerate_states_t 
ompt_enumerate_states; +static ompt_enumerate_mutex_impls_t ompt_enumerate_mutex_impls; + +static void print_ids(int level) +{ + int task_type, thread_num; + omp_frame_t *frame; + ompt_data_t *task_parallel_data; + ompt_data_t *task_data; + int exists_task = ompt_get_task_info(level, &task_type, &task_data, &frame, + &task_parallel_data, &thread_num); + char buffer[2048]; + format_task_type(task_type, buffer); + if (frame) + printf("%" PRIu64 ": task level %d: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", exit_frame=%p, reenter_frame=%p, " + "task_type=%s=%d, thread_num=%d\n", + ompt_get_thread_data()->value, level, + exists_task ? task_parallel_data->value : 0, + exists_task ? task_data->value : 0, frame->exit_frame, + frame->enter_frame, buffer, task_type, thread_num); +} + +#define get_frame_address(level) __builtin_frame_address(level) + +#define print_frame(level) \ + printf("%" PRIu64 ": __builtin_frame_address(%d)=%p\n", \ + ompt_get_thread_data()->value, level, get_frame_address(level)) + +// clang (version 5.0 and above) adds an intermediate function call with debug flag (-g) +#if defined(TEST_NEED_PRINT_FRAME_FROM_OUTLINED_FN) + #if defined(DEBUG) && defined(__clang__) && __clang_major__ >= 5 + #define print_frame_from_outlined_fn(level) print_frame(level+1) + #else + #define print_frame_from_outlined_fn(level) print_frame(level) + #endif + + #if defined(__clang__) && __clang_major__ >= 5 + #warning "Clang 5.0 and later add an additional wrapper for outlined functions when compiling with debug information." + #warning "Please define -DDEBUG iff you manually pass in -g to make the tests succeed!" + #endif +#endif + +// This macro helps to define a label at the current position that can be used +// to get the current address in the code. 
+// +// For print_current_address(): +// To reliably determine the offset between the address of the label and the +// actual return address, we insert a NOP instruction as a jump target as the +// compiler would otherwise insert an instruction that we can't control. The +// instruction length is target dependent and is explained below. +// +// (The empty block between "#pragma omp ..." and the __asm__ statement is a +// workaround for a bug in the Intel Compiler.) +#define define_ompt_label(id) \ + {} \ + __asm__("nop"); \ +ompt_label_##id: + +// This macro helps to get the address of a label that is inserted by the above +// macro define_ompt_label(). The address is obtained with a GNU extension +// (&&label) that has been tested with gcc, clang and icc. +#define get_ompt_label_address(id) (&& ompt_label_##id) + +// This macro prints the exact address that a previously called runtime function +// returns to. +#define print_current_address(id) \ + define_ompt_label(id) \ + print_possible_return_addresses(get_ompt_label_address(id)) + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +// On X86 the NOP instruction is 1 byte long. In addition, the comiler inserts +// a MOV instruction for non-void runtime functions which is 3 bytes long. +#define print_possible_return_addresses(addr) \ + printf("%" PRIu64 ": current_address=%p or %p for non-void functions\n", \ + ompt_get_thread_data()->value, ((char *)addr) - 1, ((char *)addr) - 4) +#elif KMP_ARCH_PPC64 +// On Power the NOP instruction is 4 bytes long. In addition, the compiler +// inserts an LD instruction which accounts for another 4 bytes. In contrast to +// X86 this instruction is always there, even for void runtime functions. +#define print_possible_return_addresses(addr) \ + printf("%" PRIu64 ": current_address=%p\n", ompt_get_thread_data()->value, \ + ((char *)addr) - 8) +#elif KMP_ARCH_AARCH64 +// On AArch64 the NOP instruction is 4 bytes long, can be followed by inserted +// store instruction (another 4 bytes long). 
+#define print_possible_return_addresses(addr) \ + printf("%" PRIu64 ": current_address=%p or %p\n", ompt_get_thread_data()->value, \ + ((char *)addr) - 4, ((char *)addr) - 8) +#else +#error Unsupported target architecture, cannot determine address offset! +#endif + + +// This macro performs a somewhat similar job to print_current_address(), except +// that it discards a certain number of nibbles from the address and only prints +// the most significant bits / nibbles. This can be used for cases where the +// return address can only be approximated. +// +// To account for overflows (ie the most significant bits / nibbles have just +// changed as we are a few bytes above the relevant power of two) the addresses +// of the "current" and of the "previous block" are printed. +#define print_fuzzy_address(id) \ + define_ompt_label(id) \ + print_fuzzy_address_blocks(get_ompt_label_address(id)) + +// If you change this define you need to adapt all capture patterns in the tests +// to include or discard the new number of nibbles! 
+#define FUZZY_ADDRESS_DISCARD_NIBBLES 2 +#define FUZZY_ADDRESS_DISCARD_BYTES (1 << ((FUZZY_ADDRESS_DISCARD_NIBBLES) * 4)) +#define print_fuzzy_address_blocks(addr) \ + printf("%" PRIu64 ": fuzzy_address=0x%" PRIx64 " or 0x%" PRIx64 \ + " or 0x%" PRIx64 " or 0x%" PRIx64 " (%p)\n", \ + ompt_get_thread_data()->value, \ + ((uint64_t)addr) / FUZZY_ADDRESS_DISCARD_BYTES - 1, \ + ((uint64_t)addr) / FUZZY_ADDRESS_DISCARD_BYTES, \ + ((uint64_t)addr) / FUZZY_ADDRESS_DISCARD_BYTES + 1, \ + ((uint64_t)addr) / FUZZY_ADDRESS_DISCARD_BYTES + 2, addr) + +static void +on_ompt_callback_mutex_acquire( + ompt_mutex_kind_t kind, + unsigned int hint, + unsigned int impl, + omp_wait_id_t wait_id, + const void *codeptr_ra) +{ + switch(kind) + { + case ompt_mutex_lock: + printf("%" PRIu64 ": ompt_event_wait_lock: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra); + break; + case ompt_mutex_nest_lock: + printf("%" PRIu64 ": ompt_event_wait_nest_lock: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra); + break; + case ompt_mutex_critical: + printf("%" PRIu64 ": ompt_event_wait_critical: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra); + break; + case ompt_mutex_atomic: + printf("%" PRIu64 ": ompt_event_wait_atomic: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra); + break; + case ompt_mutex_ordered: + printf("%" PRIu64 ": ompt_event_wait_ordered: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra); + break; + default: + break; + } +} + +static void +on_ompt_callback_mutex_acquired( + ompt_mutex_kind_t kind, + omp_wait_id_t wait_id, + const void 
*codeptr_ra) +{ + switch(kind) + { + case ompt_mutex_lock: + printf("%" PRIu64 ": ompt_event_acquired_lock: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + break; + case ompt_mutex_nest_lock: + printf("%" PRIu64 ": ompt_event_acquired_nest_lock_first: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + break; + case ompt_mutex_critical: + printf("%" PRIu64 ": ompt_event_acquired_critical: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + break; + case ompt_mutex_atomic: + printf("%" PRIu64 ": ompt_event_acquired_atomic: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + break; + case ompt_mutex_ordered: + printf("%" PRIu64 ": ompt_event_acquired_ordered: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + break; + default: + break; + } +} + +static void +on_ompt_callback_mutex_released( + ompt_mutex_kind_t kind, + omp_wait_id_t wait_id, + const void *codeptr_ra) +{ + switch(kind) + { + case ompt_mutex_lock: + printf("%" PRIu64 ": ompt_event_release_lock: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + break; + case ompt_mutex_nest_lock: + printf("%" PRIu64 ": ompt_event_release_nest_lock_last: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + break; + case ompt_mutex_critical: + printf("%" PRIu64 ": ompt_event_release_critical: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + break; + case ompt_mutex_atomic: + printf("%" PRIu64 ": ompt_event_release_atomic: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + break; + case ompt_mutex_ordered: + printf("%" PRIu64 ": ompt_event_release_ordered: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, 
codeptr_ra); + break; + default: + break; + } +} + +static void +on_ompt_callback_nest_lock( + ompt_scope_endpoint_t endpoint, + omp_wait_id_t wait_id, + const void *codeptr_ra) +{ + switch(endpoint) + { + case ompt_scope_begin: + printf("%" PRIu64 ": ompt_event_acquired_nest_lock_next: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + break; + case ompt_scope_end: + printf("%" PRIu64 ": ompt_event_release_nest_lock_prev: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + break; + } +} + +static void +on_ompt_callback_sync_region( + ompt_sync_region_kind_t kind, + ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + const void *codeptr_ra) +{ + switch(endpoint) + { + case ompt_scope_begin: + switch(kind) + { + case ompt_sync_region_barrier: + printf("%" PRIu64 ": ompt_event_barrier_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra); + print_ids(0); + break; + case ompt_sync_region_taskwait: + printf("%" PRIu64 ": ompt_event_taskwait_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra); + break; + case ompt_sync_region_taskgroup: + printf("%" PRIu64 ": ompt_event_taskgroup_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra); + break; + } + break; + case ompt_scope_end: + switch(kind) + { + case ompt_sync_region_barrier: + printf("%" PRIu64 ": ompt_event_barrier_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra); + break; + case ompt_sync_region_taskwait: + printf("%" PRIu64 ": ompt_event_taskwait_end: parallel_id=%" PRIu64 ", 
task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra); + break; + case ompt_sync_region_taskgroup: + printf("%" PRIu64 ": ompt_event_taskgroup_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra); + break; + } + break; + } +} + +static void +on_ompt_callback_sync_region_wait( + ompt_sync_region_kind_t kind, + ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + const void *codeptr_ra) +{ + switch(endpoint) + { + case ompt_scope_begin: + switch(kind) + { + case ompt_sync_region_barrier: + printf("%" PRIu64 ": ompt_event_wait_barrier_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra); + break; + case ompt_sync_region_taskwait: + printf("%" PRIu64 ": ompt_event_wait_taskwait_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra); + break; + case ompt_sync_region_taskgroup: + printf("%" PRIu64 ": ompt_event_wait_taskgroup_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra); + break; + } + break; + case ompt_scope_end: + switch(kind) + { + case ompt_sync_region_barrier: + printf("%" PRIu64 ": ompt_event_wait_barrier_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra); + break; + case ompt_sync_region_taskwait: + printf("%" PRIu64 ": ompt_event_wait_taskwait_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, 
codeptr_ra); + break; + case ompt_sync_region_taskgroup: + printf("%" PRIu64 ": ompt_event_wait_taskgroup_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra); + break; + } + break; + } +} + +static void +on_ompt_callback_flush( + ompt_data_t *thread_data, + const void *codeptr_ra) +{ + printf("%" PRIu64 ": ompt_event_flush: codeptr_ra=%p\n", thread_data->value, codeptr_ra); +} + +static void +on_ompt_callback_cancel( + ompt_data_t *task_data, + int flags, + const void *codeptr_ra) +{ + const char* first_flag_value; + const char* second_flag_value; + if(flags & ompt_cancel_parallel) + first_flag_value = ompt_cancel_flag_t_values[0]; + else if(flags & ompt_cancel_sections) + first_flag_value = ompt_cancel_flag_t_values[1]; + else if(flags & ompt_cancel_do) + first_flag_value = ompt_cancel_flag_t_values[2]; + else if(flags & ompt_cancel_taskgroup) + first_flag_value = ompt_cancel_flag_t_values[3]; + + if(flags & ompt_cancel_activated) + second_flag_value = ompt_cancel_flag_t_values[4]; + else if(flags & ompt_cancel_detected) + second_flag_value = ompt_cancel_flag_t_values[5]; + else if(flags & ompt_cancel_discarded_task) + second_flag_value = ompt_cancel_flag_t_values[6]; + + printf("%" PRIu64 ": ompt_event_cancel: task_data=%" PRIu64 ", flags=%s|%s=%" PRIu32 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, task_data->value, first_flag_value, second_flag_value, flags, codeptr_ra); +} + +static void +on_ompt_callback_idle( + ompt_scope_endpoint_t endpoint) +{ + switch(endpoint) + { + case ompt_scope_begin: + printf("%" PRIu64 ": ompt_event_idle_begin:\n", ompt_get_thread_data()->value); + break; + case ompt_scope_end: + printf("%" PRIu64 ": ompt_event_idle_end:\n", ompt_get_thread_data()->value); + break; + } +} + +static void +on_ompt_callback_implicit_task( + ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + 
unsigned int team_size, + unsigned int thread_num) +{ + switch(endpoint) + { + case ompt_scope_begin: + if(task_data->ptr) + printf("%s\n", "0: task_data initially not null"); + task_data->value = ompt_get_unique_id(); + printf("%" PRIu64 ": ompt_event_implicit_task_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", team_size=%" PRIu32 ", thread_num=%" PRIu32 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, team_size, thread_num); + break; + case ompt_scope_end: + printf("%" PRIu64 ": ompt_event_implicit_task_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", team_size=%" PRIu32 ", thread_num=%" PRIu32 "\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, team_size, thread_num); + break; + } +} + +static void +on_ompt_callback_lock_init( + ompt_mutex_kind_t kind, + unsigned int hint, + unsigned int impl, + omp_wait_id_t wait_id, + const void *codeptr_ra) +{ + switch(kind) + { + case ompt_mutex_lock: + printf("%" PRIu64 ": ompt_event_init_lock: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra); + break; + case ompt_mutex_nest_lock: + printf("%" PRIu64 ": ompt_event_init_nest_lock: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra); + break; + default: + break; + } +} + +static void +on_ompt_callback_lock_destroy( + ompt_mutex_kind_t kind, + omp_wait_id_t wait_id, + const void *codeptr_ra) +{ + switch(kind) + { + case ompt_mutex_lock: + printf("%" PRIu64 ": ompt_event_destroy_lock: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + break; + case ompt_mutex_nest_lock: + printf("%" PRIu64 ": ompt_event_destroy_nest_lock: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + break; + default: + break; + } +} + +static void 
+on_ompt_callback_work( + ompt_work_type_t wstype, + ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + uint64_t count, + const void *codeptr_ra) +{ + switch(endpoint) + { + case ompt_scope_begin: + switch(wstype) + { + case ompt_work_loop: + printf("%" PRIu64 ": ompt_event_loop_begin: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count); + break; + case ompt_work_sections: + printf("%" PRIu64 ": ompt_event_sections_begin: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count); + break; + case ompt_work_single_executor: + printf("%" PRIu64 ": ompt_event_single_in_block_begin: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count); + break; + case ompt_work_single_other: + printf("%" PRIu64 ": ompt_event_single_others_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count); + break; + case ompt_work_workshare: + //impl + break; + case ompt_work_distribute: + printf("%" PRIu64 ": ompt_event_distribute_begin: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count); + break; + case ompt_work_taskloop: + //impl + printf("%" PRIu64 ": ompt_event_taskloop_begin: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count); + break; + } + break; + case ompt_scope_end: + switch(wstype) + { + case 
ompt_work_loop: + printf("%" PRIu64 ": ompt_event_loop_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count); + break; + case ompt_work_sections: + printf("%" PRIu64 ": ompt_event_sections_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count); + break; + case ompt_work_single_executor: + printf("%" PRIu64 ": ompt_event_single_in_block_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count); + break; + case ompt_work_single_other: + printf("%" PRIu64 ": ompt_event_single_others_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count); + break; + case ompt_work_workshare: + //impl + break; + case ompt_work_distribute: + printf("%" PRIu64 ": ompt_event_distribute_end: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count); + break; + case ompt_work_taskloop: + //impl + printf("%" PRIu64 ": ompt_event_taskloop_end: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count); + break; + } + break; + } +} + +static void +on_ompt_callback_master( + ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + const void *codeptr_ra) +{ + switch(endpoint) + { + case ompt_scope_begin: + printf("%" PRIu64 ": ompt_event_master_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, 
parallel_data->value, task_data->value, codeptr_ra); + break; + case ompt_scope_end: + printf("%" PRIu64 ": ompt_event_master_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra); + break; + } +} + +static void +on_ompt_callback_parallel_begin( + ompt_data_t *encountering_task_data, + const omp_frame_t *encountering_task_frame, + ompt_data_t* parallel_data, + uint32_t requested_team_size, + ompt_invoker_t invoker, + const void *codeptr_ra) +{ + if(parallel_data->ptr) + printf("0: parallel_data initially not null\n"); + parallel_data->value = ompt_get_unique_id(); + printf("%" PRIu64 ": ompt_event_parallel_begin: parent_task_id=%" PRIu64 ", parent_task_frame.exit=%p, parent_task_frame.reenter=%p, parallel_id=%" PRIu64 ", requested_team_size=%" PRIu32 ", codeptr_ra=%p, invoker=%d\n", ompt_get_thread_data()->value, encountering_task_data->value, encountering_task_frame->exit_frame, encountering_task_frame->enter_frame, parallel_data->value, requested_team_size, codeptr_ra, invoker); +} + +static void +on_ompt_callback_parallel_end( + ompt_data_t *parallel_data, + ompt_data_t *encountering_task_data, + ompt_invoker_t invoker, + const void *codeptr_ra) +{ + printf("%" PRIu64 ": ompt_event_parallel_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", invoker=%d, codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, encountering_task_data->value, invoker, codeptr_ra); +} + +static void +on_ompt_callback_task_create( + ompt_data_t *encountering_task_data, + const omp_frame_t *encountering_task_frame, + ompt_data_t* new_task_data, + int type, + int has_dependences, + const void *codeptr_ra) +{ + if(new_task_data->ptr) + printf("0: new_task_data initially not null\n"); + new_task_data->value = ompt_get_unique_id(); + char buffer[2048]; + + format_task_type(type, buffer); + + //there is no parallel_begin callback for implicit parallel region + //thus it is 
initialized in initial task + if(type & ompt_task_initial) + { + ompt_data_t *parallel_data; + ompt_get_parallel_info(0, &parallel_data, NULL); + if(parallel_data->ptr) + printf("%s\n", "0: parallel_data initially not null"); + parallel_data->value = ompt_get_unique_id(); + } + + printf("%" PRIu64 ": ompt_event_task_create: parent_task_id=%" PRIu64 ", parent_task_frame.exit=%p, parent_task_frame.reenter=%p, new_task_id=%" PRIu64 ", codeptr_ra=%p, task_type=%s=%d, has_dependences=%s\n", ompt_get_thread_data()->value, encountering_task_data ? encountering_task_data->value : 0, encountering_task_frame ? encountering_task_frame->exit_frame : NULL, encountering_task_frame ? encountering_task_frame->enter_frame : NULL, new_task_data->value, codeptr_ra, buffer, type, has_dependences ? "yes" : "no"); +} + +static void +on_ompt_callback_task_schedule( + ompt_data_t *first_task_data, + ompt_task_status_t prior_task_status, + ompt_data_t *second_task_data) +{ + printf("%" PRIu64 ": ompt_event_task_schedule: first_task_id=%" PRIu64 ", second_task_id=%" PRIu64 ", prior_task_status=%s=%d\n", ompt_get_thread_data()->value, first_task_data->value, second_task_data->value, ompt_task_status_t_values[prior_task_status], prior_task_status); + if(prior_task_status == ompt_task_complete) + { + printf("%" PRIu64 ": ompt_event_task_end: task_id=%" PRIu64 "\n", ompt_get_thread_data()->value, first_task_data->value); + } +} + +static void +on_ompt_callback_task_dependences( + ompt_data_t *task_data, + const ompt_task_dependence_t *deps, + int ndeps) +{ + printf("%" PRIu64 ": ompt_event_task_dependences: task_id=%" PRIu64 ", deps=%p, ndeps=%d\n", ompt_get_thread_data()->value, task_data->value, (void *)deps, ndeps); +} + +static void +on_ompt_callback_task_dependence( + ompt_data_t *first_task_data, + ompt_data_t *second_task_data) +{ + printf("%" PRIu64 ": ompt_event_task_dependence_pair: first_task_id=%" PRIu64 ", second_task_id=%" PRIu64 "\n", ompt_get_thread_data()->value, 
first_task_data->value, second_task_data->value); +} + +static void +on_ompt_callback_thread_begin( + ompt_thread_type_t thread_type, + ompt_data_t *thread_data) +{ + if(thread_data->ptr) + printf("%s\n", "0: thread_data initially not null"); + thread_data->value = ompt_get_unique_id(); + printf("%" PRIu64 ": ompt_event_thread_begin: thread_type=%s=%d, thread_id=%" PRIu64 "\n", ompt_get_thread_data()->value, ompt_thread_type_t_values[thread_type], thread_type, thread_data->value); +} + +static void +on_ompt_callback_thread_end( + ompt_data_t *thread_data) +{ + printf("%" PRIu64 ": ompt_event_thread_end: thread_id=%" PRIu64 "\n", ompt_get_thread_data()->value, thread_data->value); +} + +static int +on_ompt_callback_control_tool( + uint64_t command, + uint64_t modifier, + void *arg, + const void *codeptr_ra) +{ + omp_frame_t* omptTaskFrame; + ompt_get_task_info(0, NULL, (ompt_data_t**) NULL, &omptTaskFrame, NULL, NULL); + printf("%" PRIu64 ": ompt_event_control_tool: command=%" PRIu64 ", modifier=%" PRIu64 ", arg=%p, codeptr_ra=%p, current_task_frame.exit=%p, current_task_frame.reenter=%p \n", ompt_get_thread_data()->value, command, modifier, arg, codeptr_ra, omptTaskFrame->exit_frame, omptTaskFrame->enter_frame); + return 0; //success +} + +#define register_callback_t(name, type) \ +do{ \ + type f_##name = &on_##name; \ + if (ompt_set_callback(name, (ompt_callback_t)f_##name) == \ + ompt_set_never) \ + printf("0: Could not register callback '" #name "'\n"); \ +}while(0) + +#define register_callback(name) register_callback_t(name, name##_t) + +int ompt_initialize( + ompt_function_lookup_t lookup, + ompt_data_t *tool_data) +{ + ompt_set_callback = (ompt_set_callback_t) lookup("ompt_set_callback"); + ompt_get_callback = (ompt_get_callback_t) lookup("ompt_get_callback"); + ompt_get_state = (ompt_get_state_t) lookup("ompt_get_state"); + ompt_get_task_info = (ompt_get_task_info_t) lookup("ompt_get_task_info"); + ompt_get_thread_data = (ompt_get_thread_data_t) 
lookup("ompt_get_thread_data"); + ompt_get_parallel_info = (ompt_get_parallel_info_t) lookup("ompt_get_parallel_info"); + ompt_get_unique_id = (ompt_get_unique_id_t) lookup("ompt_get_unique_id"); + + ompt_get_num_procs = (ompt_get_num_procs_t) lookup("ompt_get_num_procs"); + ompt_get_num_places = (ompt_get_num_places_t) lookup("ompt_get_num_places"); + ompt_get_place_proc_ids = (ompt_get_place_proc_ids_t) lookup("ompt_get_place_proc_ids"); + ompt_get_place_num = (ompt_get_place_num_t) lookup("ompt_get_place_num"); + ompt_get_partition_place_nums = (ompt_get_partition_place_nums_t) lookup("ompt_get_partition_place_nums"); + ompt_get_proc_id = (ompt_get_proc_id_t) lookup("ompt_get_proc_id"); + ompt_enumerate_states = (ompt_enumerate_states_t) lookup("ompt_enumerate_states"); + ompt_enumerate_mutex_impls = (ompt_enumerate_mutex_impls_t) lookup("ompt_enumerate_mutex_impls"); + + register_callback(ompt_callback_mutex_acquire); + register_callback_t(ompt_callback_mutex_acquired, ompt_callback_mutex_t); + register_callback_t(ompt_callback_mutex_released, ompt_callback_mutex_t); + register_callback(ompt_callback_nest_lock); + register_callback(ompt_callback_sync_region); + register_callback_t(ompt_callback_sync_region_wait, ompt_callback_sync_region_t); + register_callback(ompt_callback_control_tool); + register_callback(ompt_callback_flush); + register_callback(ompt_callback_cancel); + register_callback(ompt_callback_idle); + register_callback(ompt_callback_implicit_task); + register_callback_t(ompt_callback_lock_init, ompt_callback_mutex_acquire_t); + register_callback_t(ompt_callback_lock_destroy, ompt_callback_mutex_t); + register_callback(ompt_callback_work); + register_callback(ompt_callback_master); + register_callback(ompt_callback_parallel_begin); + register_callback(ompt_callback_parallel_end); + register_callback(ompt_callback_task_create); + register_callback(ompt_callback_task_schedule); + register_callback(ompt_callback_task_dependences); + 
register_callback(ompt_callback_task_dependence); + register_callback(ompt_callback_thread_begin); + register_callback(ompt_callback_thread_end); + printf("0: NULL_POINTER=%p\n", (void*)NULL); + return 1; //success +} + +void ompt_finalize(ompt_data_t *tool_data) +{ + printf("0: ompt_event_runtime_shutdown\n"); +} + +ompt_start_tool_result_t* ompt_start_tool( + unsigned int omp_version, + const char *runtime_version) +{ + static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,&ompt_finalize, 0}; + return &ompt_start_tool_result; +} diff --git a/final/runtime/test/ompt/cancel/cancel_parallel.c b/final/runtime/test/ompt/cancel/cancel_parallel.c new file mode 100644 index 0000000..b03239d --- /dev/null +++ b/final/runtime/test/ompt/cancel/cancel_parallel.c @@ -0,0 +1,40 @@ +// RUN: %libomp-compile && env OMP_CANCELLATION=true %libomp-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +// Current GOMP interface implementation does not support cancellation +// XFAIL: gcc + +#include "callback.h" +#include "omp.h" + +int main() { + #pragma omp parallel num_threads(2) + { + if (omp_get_thread_num() == 0) { + print_fuzzy_address_blocks(get_ompt_label_address(1)); + #pragma omp cancel parallel + define_ompt_label(1); + // We cannot print at this location because the parallel region is cancelled! + } else { + delay(100); + print_fuzzy_address_blocks(get_ompt_label_address(2)); + #pragma omp cancellation point parallel + define_ompt_label(2); + // We cannot print at this location because the parallel region is cancelled! + } + } + + // Check if libomp supports the callbacks for this test. 
+ // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_cancel' + + // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_task_create: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=[[NULL]], new_task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[NULL]], task_type=ompt_task_initial=1, has_dependences=no + // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_cancel: task_data=[[TASK_ID:[0-9]+]], flags=ompt_cancel_parallel|ompt_cancel_activated=17, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK-DAG: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_cancel: task_data=[[TASK_ID:[0-9]+]], flags=ompt_cancel_parallel|ompt_cancel_detected=33, codeptr_ra=[[OTHER_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK-DAG: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[OTHER_RETURN_ADDRESS]] + + return 0; +} diff --git a/final/runtime/test/ompt/cancel/cancel_taskgroup.c b/final/runtime/test/ompt/cancel/cancel_taskgroup.c new file mode 100644 index 0000000..803fa97 --- /dev/null +++ b/final/runtime/test/ompt/cancel/cancel_taskgroup.c @@ -0,0 +1,89 @@ +// RUN: %libomp-compile && env OMP_CANCELLATION=true %libomp-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +// UNSUPPORTED: clang-3, clang-4.0.0 +// Current GOMP interface implementation does not support cancellation; icc 16 has a bug +// XFAIL: gcc, icc-16 + +#include "callback.h" +#include <unistd.h> +#include <stdio.h> + +int main() +{ + int condition=0; + #pragma omp parallel num_threads(2) + {} + + print_frame(0); + #pragma omp parallel num_threads(2) + { + #pragma omp master + { + #pragma omp taskgroup + { + #pragma omp task shared(condition) + { + printf("start execute task 1\n"); + OMPT_SIGNAL(condition); + 
OMPT_WAIT(condition,2); + #pragma omp cancellation point taskgroup + printf("end execute task 1\n"); + } + #pragma omp task shared(condition) + { + printf("start execute task 2\n"); + OMPT_SIGNAL(condition); + OMPT_WAIT(condition,2); + #pragma omp cancellation point taskgroup + printf("end execute task 2\n"); + } + #pragma omp task shared(condition) + { + printf("start execute task 3\n"); + OMPT_SIGNAL(condition); + OMPT_WAIT(condition,2); + #pragma omp cancellation point taskgroup + printf("end execute task 3\n"); + } + #pragma omp task if(0) shared(condition) + { + printf("start execute task 4\n"); + OMPT_WAIT(condition,1); + #pragma omp cancel taskgroup + printf("end execute task 4\n"); + } + OMPT_SIGNAL(condition); + } + } + #pragma omp barrier + } + + + // Check if libomp supports the callbacks for this test. + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_master' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_cancel' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_begin' + + + // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_master_begin: parallel_id=[[PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]*}} + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[PARENT_TASK_ID]], parent_task_frame.exit={{0x[0-f]*}}, parent_task_frame.reenter={{0x[0-f]*}}, new_task_id=[[FIRST_TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]*}}, task_type=ompt_task_explicit=4, has_dependences=no + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[PARENT_TASK_ID]], parent_task_frame.exit={{0x[0-f]*}}, parent_task_frame.reenter={{0x[0-f]*}}, new_task_id=[[SECOND_TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]*}}, task_type=ompt_task_explicit=4, has_dependences=no + // 
CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[PARENT_TASK_ID]], parent_task_frame.exit={{0x[0-f]*}}, parent_task_frame.reenter={{0x[0-f]*}}, new_task_id=[[THIRD_TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]*}}, task_type=ompt_task_explicit=4, has_dependences=no + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[PARENT_TASK_ID]], parent_task_frame.exit={{0x[0-f]*}}, parent_task_frame.reenter={{0x[0-f]*}}, new_task_id=[[CANCEL_TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]*}}, task_type=ompt_task_explicit|ompt_task_undeferred=134217732, has_dependences=no + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[PARENT_TASK_ID]], second_task_id=[[CANCEL_TASK_ID]], prior_task_status=ompt_task_others=4 + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_cancel: task_data=[[CANCEL_TASK_ID]], flags=ompt_cancel_taskgroup|ompt_cancel_activated=24, codeptr_ra={{0x[0-f]*}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[CANCEL_TASK_ID]], second_task_id=[[PARENT_TASK_ID]], prior_task_status=ompt_task_cancel=3 + + // CHECK-DAG: {{^}}{{[0-9]+}}: ompt_event_cancel: task_data={{[0-9]+}}, flags=ompt_cancel_taskgroup|ompt_cancel_discarded_task=72, codeptr_ra=[[NULL]] + // CHECK-DAG: {{^}}{{[0-9]+}}: ompt_event_cancel: task_data={{[0-9]+}}, flags=ompt_cancel_taskgroup|ompt_cancel_discarded_task=72, codeptr_ra=[[NULL]] + + // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]] + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_cancel: task_data={{[0-9]+}}, flags=ompt_cancel_taskgroup|ompt_cancel_detected=40, codeptr_ra={{0x[0-f]*}} + + return 0; +} diff --git a/final/runtime/test/ompt/cancel/cancel_worksharing.c b/final/runtime/test/ompt/cancel/cancel_worksharing.c new file mode 100644 index 0000000..db3b168 --- /dev/null +++ b/final/runtime/test/ompt/cancel/cancel_worksharing.c @@ -0,0 +1,67 @@ +// RUN: %libomp-compile && env OMP_CANCELLATION=true 
%libomp-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +// Current GOMP interface implementation does not support cancellation; icc 16 does not distinguish between sections and loops +// XFAIL: gcc, icc-16 + +#include "callback.h" +#include <unistd.h> + +int main() +{ + int condition=0; + #pragma omp parallel num_threads(2) + { + int x = 0; + int i; + #pragma omp for + for(i = 0; i < 2; i++) + { + if(i == 0) + { + x++; + OMPT_SIGNAL(condition); + #pragma omp cancel for + } + else + { + x++; + OMPT_WAIT(condition,1); + delay(10000); + #pragma omp cancellation point for + } + } + } + #pragma omp parallel num_threads(2) + { + #pragma omp sections + { + #pragma omp section + { + OMPT_SIGNAL(condition); + #pragma omp cancel sections + } + #pragma omp section + { + OMPT_WAIT(condition,2); + delay(10000); + #pragma omp cancellation point sections + } + } + } + + + // Check if libomp supports the callbacks for this test. + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_cancel' + + // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_task_create: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=[[NULL]], new_task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[NULL]], task_type=ompt_task_initial=1, has_dependences=no + + // cancel for and sections + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_cancel: task_data=[[TASK_ID:[0-9]+]], flags=ompt_cancel_do|ompt_cancel_activated=20, codeptr_ra={{0x[0-f]*}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_cancel: task_data=[[TASK_ID:[0-9]+]], flags=ompt_cancel_sections|ompt_cancel_activated=18, codeptr_ra={{0x[0-f]*}} + // CHECK: {{^}}[[OTHER_THREAD_ID:[0-9]+]]: ompt_event_cancel: task_data=[[TASK_ID:[0-9]+]], flags=ompt_cancel_do|ompt_cancel_detected=36, codeptr_ra={{0x[0-f]*}} + // CHECK: {{^}}[[OTHER_THREAD_ID:[0-9]+]]: ompt_event_cancel: 
task_data=[[TASK_ID:[0-9]+]], flags=ompt_cancel_sections|ompt_cancel_detected=34, codeptr_ra={{0x[0-f]*}} + + return 0; +} diff --git a/final/runtime/test/ompt/loadtool/tool_available/tool_available.c b/final/runtime/test/ompt/loadtool/tool_available/tool_available.c new file mode 100644 index 0000000..fbbdadd --- /dev/null +++ b/final/runtime/test/ompt/loadtool/tool_available/tool_available.c @@ -0,0 +1,74 @@ +// The OpenMP standard defines 3 ways of providing ompt_start_tool: +// 1. "statically-linking the tool’s definition of ompt_start_tool into an OpenMP application" +// RUN: %libomp-compile -DCODE -DTOOL && %libomp-run | FileCheck %s + +// Note: We should compile the tool without -fopenmp as other tools developer +// would do. Otherwise this test may pass for the wrong reasons on Darwin. +// RUN: %clang %flags -DTOOL -shared -fPIC %s -o %T/tool.so +// 2. "introducing a dynamically-linked library that includes the tool’s definition of ompt_start_tool into the application’s address space" +// 2.1 Link with tool during compilation +// RUN: %libomp-compile -DCODE %no-as-needed-flag %T/tool.so && %libomp-run | FileCheck %s +// 2.2 Link with tool during compilation, but AFTER the runtime +// RUN: %libomp-compile -DCODE -lomp %no-as-needed-flag %T/tool.so && %libomp-run | FileCheck %s +// 2.3 Inject tool via the dynamic loader +// RUN: %libomp-compile -DCODE && %preload-tool %libomp-run | FileCheck %s + +// 3. "providing the name of a dynamically-linked library appropriate for the architecture and operating system used by the application in the tool-libraries-var ICV" +// RUN: %libomp-compile -DCODE && env OMP_TOOL_LIBRARIES=%T/tool.so %libomp-run | FileCheck %s + +// REQUIRES: ompt + +/* + * This file contains code for an OMPT shared library tool to be + * loaded and the code for the OpenMP executable. 
+ * -DTOOL enables the code for the tool during compilation + * -DCODE enables the code for the executable during compilation + */ + +#ifdef CODE +#include "omp.h" + +int main() +{ + #pragma omp parallel num_threads(2) + { + } + + + // Check if libomp supports the callbacks for this test. + // CHECK-NOT: {{^}}0: Could not register callback + + // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] + // CHECK: {{^}}0: ompt_event_runtime_shutdown + + return 0; +} + +#endif /* CODE */ + +#ifdef TOOL + +#include <stdio.h> +#include <ompt.h> + +int ompt_initialize( + ompt_function_lookup_t lookup, + ompt_data_t* tool_data) +{ + printf("0: NULL_POINTER=%p\n", (void*)NULL); + return 1; //success +} + +void ompt_finalize(ompt_data_t* tool_data) +{ + printf("0: ompt_event_runtime_shutdown\n"); +} + +ompt_start_tool_result_t* ompt_start_tool( + unsigned int omp_version, + const char *runtime_version) +{ + static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,&ompt_finalize, 0}; + return &ompt_start_tool_result; +} +#endif /* TOOL */ diff --git a/final/runtime/test/ompt/loadtool/tool_available_search/tool_available_search.c b/final/runtime/test/ompt/loadtool/tool_available_search/tool_available_search.c new file mode 100644 index 0000000..a6fe8e9 --- /dev/null +++ b/final/runtime/test/ompt/loadtool/tool_available_search/tool_available_search.c @@ -0,0 +1,104 @@ +// RUN: %clang %flags -shared -fPIC %s -o %T/first_tool.so +// RUN: %clang %flags -DTOOL -DSECOND_TOOL -shared -fPIC %s -o %T/second_tool.so +// RUN: %clang %flags -DTOOL -DTHIRD_TOOL -shared -fPIC %s -o %T/third_tool.so +// RUN: %libomp-compile -DCODE && env OMP_TOOL_LIBRARIES=%T/non_existing_file.so:%T/first_tool.so:%T/second_tool.so:%T/third_tool.so %libomp-run | FileCheck %s + +// REQUIRES: ompt + +/* + * This file contains code for three OMPT shared library tool to be + * loaded and the code for the OpenMP executable. 
+ * No option enables code for the first shared library + * (without an implementation of ompt_start_tool) during compilation + * -DTOOL -DSECOND_TOOL enables the code for the second tool during compilation + * -DTOOL -DTHIRD_TOOL enables the code for the third tool during compilation + * -DCODE enables the code for the executable during compilation + */ + +#ifdef CODE +#include "stdio.h" +#include "omp.h" +#include "ompt.h" + +int main() +{ + #pragma omp parallel num_threads(2) + { + #pragma omp master + { + int result = omp_control_tool(omp_control_tool_start, 0, NULL); + printf("0: control_tool()=%d\n", result); + } + } + + + // Check if libomp supports the callbacks for this test. + // CHECK-NOT: {{^}}0: Could not register callback + + // CHECK: {{^}}0: Do not initialize tool + + // CHECK: {{^}}0: Do initialize tool + // CHECK: {{^}}0: Tool initialized + // CHECK: {{^}}0: ompt_event_thread_begin + // CHECK-DAG: {{^}}0: ompt_event_thread_begin + // CHECK-DAG: {{^}}0: control_tool()=-1 + // CHECK: {{^}}0: Tool finalized + + + return 0; +} + +#endif /* CODE */ + +#ifdef TOOL + +#include <ompt.h> +#include "stdio.h" + +#ifdef SECOND_TOOL +// The second tool has an implementation of ompt_start_tool that returns NULL +ompt_start_tool_result_t* ompt_start_tool( + unsigned int omp_version, + const char *runtime_version) +{ + printf("0: Do not initialize tool\n"); + return NULL; +} +#elif defined(THIRD_TOOL) +// The third tool has an implementation of ompt_start_tool that returns a +// pointer to a valid instance of ompt_start_tool_result_t + +static void +on_ompt_callback_thread_begin( + ompt_thread_type_t thread_type, + ompt_data_t *thread_data) +{ + printf("0: ompt_event_thread_begin\n"); +} + +int ompt_initialize( + ompt_function_lookup_t lookup, + ompt_data_t *tool_data) +{ + ompt_set_callback_t ompt_set_callback = (ompt_set_callback_t) lookup("ompt_set_callback"); + ompt_set_callback(ompt_callback_thread_begin, (ompt_callback_t)on_ompt_callback_thread_begin); + 
printf("0: Tool initialized\n"); + return 1; +} + +void ompt_finalize(ompt_data_t *tool_data) +{ + printf("0: Tool finalized\n"); +} + +ompt_start_tool_result_t* ompt_start_tool( + unsigned int omp_version, + const char *runtime_version) +{ + printf("0: Do initialize tool\n"); + static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,&ompt_finalize, 0}; + return &ompt_start_tool_result; +} +#endif + +#endif /* TOOL */ diff --git a/final/runtime/test/ompt/loadtool/tool_not_available/tool_not_available.c b/final/runtime/test/ompt/loadtool/tool_not_available/tool_not_available.c new file mode 100644 index 0000000..b0d3f2b --- /dev/null +++ b/final/runtime/test/ompt/loadtool/tool_not_available/tool_not_available.c @@ -0,0 +1,69 @@ +// The OpenMP standard defines 3 ways of providing ompt_start_tool: +// 1. "statically-linking the tool’s definition of ompt_start_tool into an OpenMP application" +// RUN: %libomp-compile -DCODE -DTOOL && %libomp-run | FileCheck %s + +// Note: We should compile the tool without -fopenmp as other tools developer +// would do. Otherwise this test may pass for the wrong reasons on Darwin. +// RUN: %clang %flags -DTOOL -shared -fPIC %s -o %T/tool.so +// 2. "introducing a dynamically-linked library that includes the tool’s definition of ompt_start_tool into the application’s address space" +// 2.1 Link with tool during compilation +// RUN: %libomp-compile -DCODE %no-as-needed-flag %T/tool.so && %libomp-run | FileCheck %s +// 2.2 Link with tool during compilation, but AFTER the runtime +// RUN: %libomp-compile -DCODE -lomp %no-as-needed-flag %T/tool.so && %libomp-run | FileCheck %s +// 2.3 Inject tool via the dynamic loader +// RUN: %libomp-compile -DCODE && %preload-tool %libomp-run | FileCheck %s + +// 3. 
"providing the name of a dynamically-linked library appropriate for the architecture and operating system used by the application in the tool-libraries-var ICV" +// RUN: %libomp-compile -DCODE && env OMP_TOOL_LIBRARIES=%T/tool.so %libomp-run | FileCheck %s + +// REQUIRES: ompt + +/* + * This file contains code for an OMPT shared library tool to be + * loaded and the code for the OpenMP executable. + * -DTOOL enables the code for the tool during compilation + * -DCODE enables the code for the executable during compilation + */ + +#ifdef CODE +#include "stdio.h" +#include "omp.h" +#include "ompt.h" + +int main() +{ + #pragma omp parallel num_threads(2) + { + #pragma omp master + { + int result = omp_control_tool(omp_control_tool_start, 0, NULL); + printf("0: control_tool()=%d\n", result); + } + } + + + // Check if libomp supports the callbacks for this test. + // CHECK-NOT: {{^}}0: Could not register callback + + // CHECK: {{^}}0: Do not initialize tool + // CHECK: {{^}}0: control_tool()=-2 + + + return 0; +} + +#endif /* CODE */ + +#ifdef TOOL + +#include <ompt.h> +#include "stdio.h" + +ompt_start_tool_result_t* ompt_start_tool( + unsigned int omp_version, + const char *runtime_version) +{ + printf("0: Do not initialize tool\n"); + return NULL; +} +#endif /* TOOL */ diff --git a/final/runtime/test/ompt/misc/api_calls_from_other_thread.cpp b/final/runtime/test/ompt/misc/api_calls_from_other_thread.cpp new file mode 100644 index 0000000..470d7cd --- /dev/null +++ b/final/runtime/test/ompt/misc/api_calls_from_other_thread.cpp @@ -0,0 +1,92 @@ +// RUN: %libomp-cxx-compile-and-run | FileCheck %s +// REQUIRES: ompt, linux + +#include <thread> +#include "callback.h" + +void f() { + ompt_data_t *tdata = ompt_get_thread_data(); + uint64_t tvalue = tdata ? 
tdata->value : 0; + + printf("%" PRIu64 ": ompt_get_num_places()=%d\n", tvalue, + ompt_get_num_places()); + + printf("%" PRIu64 ": ompt_get_place_proc_ids()=%d\n", tvalue, + ompt_get_place_proc_ids(0, 0, NULL)); + + printf("%" PRIu64 ": ompt_get_place_num()=%d\n", tvalue, + ompt_get_place_num()); + + printf("%" PRIu64 ": ompt_get_partition_place_nums()=%d\n", tvalue, + ompt_get_partition_place_nums(0, NULL)); + + printf("%" PRIu64 ": ompt_get_proc_id()=%d\n", tvalue, ompt_get_proc_id()); + + printf("%" PRIu64 ": ompt_get_num_procs()=%d\n", tvalue, + ompt_get_num_procs()); + + ompt_callback_t callback; + printf("%" PRIu64 ": ompt_get_callback()=%d\n", tvalue, + ompt_get_callback(ompt_callback_thread_begin, &callback)); + + printf("%" PRIu64 ": ompt_get_state()=%d\n", tvalue, ompt_get_state(NULL)); + + int state = omp_state_undefined; + const char *state_name; + printf("%" PRIu64 ": ompt_enumerate_states()=%d\n", tvalue, + ompt_enumerate_states(state, &state, &state_name)); + + int impl = ompt_mutex_impl_unknown; + const char *impl_name; + printf("%" PRIu64 ": ompt_enumerate_mutex_impls()=%d\n", tvalue, + ompt_enumerate_mutex_impls(impl, &impl, &impl_name)); + + printf("%" PRIu64 ": ompt_get_thread_data()=%p\n", tvalue, + ompt_get_thread_data()); + + printf("%" PRIu64 ": ompt_get_parallel_info()=%d\n", tvalue, + ompt_get_parallel_info(0, NULL, NULL)); + + printf("%" PRIu64 ": ompt_get_task_info()=%d\n", tvalue, + ompt_get_task_info(0, NULL, NULL, NULL, NULL, NULL)); +} + +int main() { +#pragma omp parallel num_threads(1) + {} + + std::thread t1(f); + t1.join(); + + // Check if libomp supports the callbacks for this test. 
+ + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_get_num_places()={{[0-9]+}} + + // CHECK: {{^}}[[MASTER_ID]]: ompt_get_place_proc_ids()={{[0-9]+}} + + // CHECK: {{^}}[[MASTER_ID]]: ompt_get_place_num()=-1 + + // CHECK: {{^}}[[MASTER_ID]]: ompt_get_partition_place_nums()=0 + + // CHECK: {{^}}[[MASTER_ID]]: ompt_get_proc_id()=-1 + + // CHECK: {{^}}[[MASTER_ID]]: ompt_get_num_procs()={{[0-9]+}} + + // CHECK: {{^}}[[MASTER_ID]]: ompt_get_callback()=1 + + // CHECK: {{^}}[[MASTER_ID]]: ompt_get_state()=0 + + // CHECK: {{^}}[[MASTER_ID]]: ompt_enumerate_states()=1 + + // CHECK: {{^}}[[MASTER_ID]]: ompt_enumerate_mutex_impls()=1 + + // CHECK: {{^}}[[MASTER_ID]]: ompt_get_thread_data()=[[NULL]] + + // CHECK: {{^}}[[MASTER_ID]]: ompt_get_parallel_info()=0 + + // CHECK: {{^}}[[MASTER_ID]]: ompt_get_task_info()=0 + + return 0; +} diff --git a/final/runtime/test/ompt/misc/api_calls_misc.c b/final/runtime/test/ompt/misc/api_calls_misc.c new file mode 100644 index 0000000..d567b1b --- /dev/null +++ b/final/runtime/test/ompt/misc/api_calls_misc.c @@ -0,0 +1,72 @@ +// RUN: %libomp-compile && %libomp-run | FileCheck %s +// REQUIRES: ompt +#include "callback.h" +#include <omp.h> + +int main() { +#pragma omp parallel num_threads(1) + { + // ompt_get_callback() + ompt_callback_t callback; + ompt_get_callback(ompt_callback_thread_begin, &callback); + printf("%" PRIu64 ": &on_ompt_callback_thread_begin=%p\n", + ompt_get_thread_data()->value, &on_ompt_callback_thread_begin); + printf("%" PRIu64 ": ompt_get_callback() result=%p\n", + ompt_get_thread_data()->value, callback); + + // ompt_get_state() + printf("%" PRIu64 ": ompt_get_state()=%d\n", ompt_get_thread_data()->value, + ompt_get_state(NULL)); + + // ompt_enumerate_states() + int state = omp_state_undefined; + const char *state_name; + int steps = 0; + while (ompt_enumerate_states(state, &state, &state_name) && steps < 1000) { + steps++; + if (!state_name) + printf("%" PRIu64 ": state_name is 
NULL\n", + ompt_get_thread_data()->value); + } + if (steps >= 1000) { + // enumeration did not end after 1000 steps + printf("%" PRIu64 ": states enumeration did not end\n", + ompt_get_thread_data()->value); + } + + // ompt_enumerate_mutex_impls() + int impl = ompt_mutex_impl_unknown; + const char *impl_name; + steps = 0; + while (ompt_enumerate_mutex_impls(impl, &impl, &impl_name) && + steps < 1000) { + steps++; + if (!impl_name) + printf("%" PRIu64 ": impl_name is NULL\n", + ompt_get_thread_data()->value); + } + if (steps >= 1000) { + // enumeration did not end after 1000 steps + printf("%" PRIu64 ": mutex_impls enumeration did not end\n", + ompt_get_thread_data()->value); + } + } + + // Check if libomp supports the callbacks for this test. + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: &on_ompt_callback_thread_begin + // CHECK-SAME: =[[FUNCTION_POINTER:0x[0-f]+]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_get_callback() result=[[FUNCTION_POINTER]] + + // CHECK: {{^}}[[THREAD_ID]]: ompt_get_state()=1 + + // CHECK-NOT: {{^}}[[THREAD_ID]]: state_name is NULL + // CHECK-NOT: {{^}}[[THREAD_ID]]: states enumeration did not end + + // CHECK-NOT: {{^}}[[THREAD_ID]]: impl_name is NULL + // CHECK-NOT: {{^}}[[THREAD_ID]]: mutex_impls enumeration did not end + + return 0; +} diff --git a/final/runtime/test/ompt/misc/api_calls_places.c b/final/runtime/test/ompt/misc/api_calls_places.c new file mode 100644 index 0000000..ad338a7 --- /dev/null +++ b/final/runtime/test/ompt/misc/api_calls_places.c @@ -0,0 +1,88 @@ +// RUN: %libomp-compile && env OMP_PLACES=cores %libomp-run | FileCheck %s +// REQUIRES: ompt, linux +#include "callback.h" +#include <omp.h> +#define __USE_GNU +#include <sched.h> +#undef __USE_GNU + +void print_list(char *function_name, int size, int list[]) { + printf("%" PRIu64 ": %s(0)=(%d", ompt_get_thread_data()->value, function_name, + list[0]); + int i; + for (i = 1; i < size; i++) { + printf(",%d", list[i]); + } + 
printf(")\n"); +} + +int main() { +#pragma omp parallel num_threads(1) + { + printf("%" PRIu64 ": omp_get_num_places()=%d\n", + ompt_get_thread_data()->value, omp_get_num_places()); + printf("%" PRIu64 ": ompt_get_num_places()=%d\n", + ompt_get_thread_data()->value, ompt_get_num_places()); + + int omp_ids_size = omp_get_place_num_procs(0); + int omp_ids[omp_ids_size]; + omp_get_place_proc_ids(0, omp_ids); + print_list("omp_get_place_proc_ids", omp_ids_size, omp_ids); + int ompt_ids_size = ompt_get_place_proc_ids(0, 0, NULL); + int ompt_ids[ompt_ids_size]; + ompt_get_place_proc_ids(0, ompt_ids_size, ompt_ids); + print_list("ompt_get_place_proc_ids", ompt_ids_size, ompt_ids); + + printf("%" PRIu64 ": omp_get_place_num()=%d\n", + ompt_get_thread_data()->value, omp_get_place_num()); + printf("%" PRIu64 ": ompt_get_place_num()=%d\n", + ompt_get_thread_data()->value, ompt_get_place_num()); + + int omp_nums_size = omp_get_partition_num_places(); + int omp_nums[omp_nums_size]; + omp_get_partition_place_nums(omp_nums); + print_list("omp_get_partition_place_nums", omp_nums_size, omp_nums); + int ompt_nums_size = ompt_get_partition_place_nums(0, NULL); + int ompt_nums[ompt_nums_size]; + ompt_get_partition_place_nums(ompt_nums_size, ompt_nums); + print_list("ompt_get_partition_place_nums", ompt_nums_size, ompt_nums); + + printf("%" PRIu64 ": sched_getcpu()=%d\n", ompt_get_thread_data()->value, + sched_getcpu()); + printf("%" PRIu64 ": ompt_get_proc_id()=%d\n", + ompt_get_thread_data()->value, ompt_get_proc_id()); + + printf("%" PRIu64 ": omp_get_num_procs()=%d\n", + ompt_get_thread_data()->value, omp_get_num_procs()); + printf("%" PRIu64 ": ompt_get_num_procs()=%d\n", + ompt_get_thread_data()->value, ompt_get_num_procs()); + } + + // Check if libomp supports the callbacks for this test. 
+ + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: omp_get_num_places + // CHECK-SAME: ()=[[NUM_PLACES:[0-9]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_get_num_places()=[[NUM_PLACES]] + + // CHECK: {{^}}[[MASTER_ID]]: omp_get_place_proc_ids + // CHECK-SAME: (0)=([[PROC_IDS:[0-9\,]+]]) + // CHECK: {{^}}[[MASTER_ID]]: ompt_get_place_proc_ids(0)=([[PROC_IDS]]) + + // CHECK: {{^}}[[MASTER_ID]]: omp_get_place_num()=[[PLACE_NUM:[-]?[0-9]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_get_place_num()=[[PLACE_NUM]] + + // CHECK: {{^}}[[MASTER_ID]]: omp_get_partition_place_nums + // CHECK-SAME: (0)=([[PARTITION_PLACE_NUMS:[0-9\,]+]]) + // CHECK: {{^}}[[MASTER_ID]]: ompt_get_partition_place_nums + // CHECK-SAME: (0)=([[PARTITION_PLACE_NUMS]]) + + // CHECK: {{^}}[[MASTER_ID]]: sched_getcpu()=[[CPU_ID:[0-9]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_get_proc_id()=[[CPU_ID]] + + // CHECK: {{^}}[[MASTER_ID]]: omp_get_num_procs()=[[NUM_PROCS:[-]?[0-9]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_get_num_procs()=[[NUM_PROCS]] + + return 0; +} diff --git a/final/runtime/test/ompt/misc/control_tool.c b/final/runtime/test/ompt/misc/control_tool.c new file mode 100644 index 0000000..2c59666 --- /dev/null +++ b/final/runtime/test/ompt/misc/control_tool.c @@ -0,0 +1,29 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// REQUIRES: ompt +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7 +#define TEST_NEED_PRINT_FRAME_FROM_OUTLINED_FN +#include "callback.h" +#include <omp.h> + +int main() +{ + #pragma omp parallel num_threads(1) + { + print_frame_from_outlined_fn(1); + print_frame(0); + omp_control_tool(omp_control_tool_flush, 1, NULL); + print_current_address(0); + } + + // Check if libomp supports the callbacks for this test. 
+ // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_control_tool' + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: __builtin_frame_address({{.}})=[[EXIT_FRAME:0x[0-f]*]] + // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address(0)=[[REENTER_FRAME:0x[0-f]*]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_control_tool: command=3, modifier=1, arg=[[NULL]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]*]], current_task_frame.exit=[[EXIT_FRAME]], current_task_frame.reenter=[[REENTER_FRAME]] + // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]] + + return 0; +} diff --git a/final/runtime/test/ompt/misc/control_tool_no_ompt_support.c b/final/runtime/test/ompt/misc/control_tool_no_ompt_support.c new file mode 100644 index 0000000..ee64da0 --- /dev/null +++ b/final/runtime/test/ompt/misc/control_tool_no_ompt_support.c @@ -0,0 +1,12 @@ +// RUN: %libomp-compile-and-run +#include <omp.h> + +int main() +{ + #pragma omp parallel num_threads(1) + { + omp_control_tool(omp_control_tool_flush, 1, NULL); + } + + return 0; +} diff --git a/final/runtime/test/ompt/misc/idle.c b/final/runtime/test/ompt/misc/idle.c new file mode 100644 index 0000000..7413c32 --- /dev/null +++ b/final/runtime/test/ompt/misc/idle.c @@ -0,0 +1,32 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// REQUIRES: ompt +#include "callback.h" +#include <omp.h> + +int main() +{ + int x = 0; + #pragma omp parallel num_threads(3) + { + #pragma omp atomic + x++; + } + #pragma omp parallel num_threads(2) + { + #pragma omp atomic + x++; + } + + + printf("x=%d\n", x); + + // Check if libomp supports the callbacks for this test. 
+ // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_idle' + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_idle_begin: + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_idle_end: + + return 0; +} diff --git a/final/runtime/test/ompt/misc/interoperability.cpp b/final/runtime/test/ompt/misc/interoperability.cpp new file mode 100644 index 0000000..102e6de --- /dev/null +++ b/final/runtime/test/ompt/misc/interoperability.cpp @@ -0,0 +1,115 @@ +// RUN: %libomp-cxx-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt + +#include <iostream> +#include <thread> +#include <alloca.h> + +#include "callback.h" +#include "omp.h" + +int condition = 0; + +void f() { + // Call OpenMP API function to force initialization of OMPT. + // (omp_get_thread_num() does not work because it just returns 0 if the + // runtime isn't initialized yet...) + omp_get_num_threads(); + + // Call alloca() to force availability of frame pointer + void *p = alloca(0); + + OMPT_SIGNAL(condition); + // Wait for both initial threads to arrive that will eventually become the + // master threads in the following parallel region. + OMPT_WAIT(condition, 2); + +#pragma omp parallel num_threads(2) + { + // Wait for all threads to arrive so that no worker thread can be reused... + OMPT_SIGNAL(condition); + OMPT_WAIT(condition, 6); + } +} + +int main() { + std::thread t1(f); + std::thread t2(f); + t1.join(); + t2.join(); +} + +// Check if libomp supports the callbacks for this test. 
+// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_begin' + +// CHECK: 0: NULL_POINTER=[[NULL:.*$]] + +// first master thread +// CHECK: {{^}}[[MASTER_ID_1:[0-9]+]]: ompt_event_thread_begin: +// CHECK-SAME: thread_type=ompt_thread_initial=1, thread_id=[[MASTER_ID_1]] + +// CHECK: {{^}}[[MASTER_ID_1]]: ompt_event_task_create: parent_task_id=0 +// CHECK-SAME: parent_task_frame.exit=[[NULL]] +// CHECK-SAME: parent_task_frame.reenter=[[NULL]] +// CHECK-SAME: new_task_id=[[PARENT_TASK_ID_1:[0-9]+]] +// CHECK-SAME: codeptr_ra=[[NULL]], task_type=ompt_task_initial=1 +// CHECK-SAME: has_dependences=no + +// CHECK: {{^}}[[MASTER_ID_1]]: ompt_event_parallel_begin: +// CHECK-SAME: parent_task_id=[[PARENT_TASK_ID_1]] +// CHECK-SAME: parent_task_frame.exit=[[NULL]] +// CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}} +// CHECK-SAME: parallel_id=[[PARALLEL_ID_1:[0-9]+]], requested_team_size=2 +// CHECK-SAME: codeptr_ra=0x{{[0-f]+}}, invoker={{.*}} + +// CHECK: {{^}}[[MASTER_ID_1]]: ompt_event_parallel_end: +// CHECK-SAME: parallel_id=[[PARALLEL_ID_1]], task_id=[[PARENT_TASK_ID_1]] +// CHECK-SAME: invoker={{[0-9]+}} + +// CHECK: {{^}}[[MASTER_ID_1]]: ompt_event_thread_end: +// CHECK-SAME: thread_id=[[MASTER_ID_1]] + +// second master thread +// CHECK: {{^}}[[MASTER_ID_2:[0-9]+]]: ompt_event_thread_begin: +// CHECK-SAME: thread_type=ompt_thread_initial=1, thread_id=[[MASTER_ID_2]] + +// CHECK: {{^}}[[MASTER_ID_2]]: ompt_event_task_create: parent_task_id=0 +// CHECK-SAME: parent_task_frame.exit=[[NULL]] +// CHECK-SAME: parent_task_frame.reenter=[[NULL]] +// 
CHECK-SAME: new_task_id=[[PARENT_TASK_ID_2:[0-9]+]] +// CHECK-SAME: codeptr_ra=[[NULL]], task_type=ompt_task_initial=1 +// CHECK-SAME: has_dependences=no + +// CHECK: {{^}}[[MASTER_ID_2]]: ompt_event_parallel_begin: +// CHECK-SAME: parent_task_id=[[PARENT_TASK_ID_2]] +// CHECK-SAME: parent_task_frame.exit=[[NULL]] +// CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}} +// CHECK-SAME: parallel_id=[[PARALLEL_ID_2:[0-9]+]] +// CHECK-SAME: requested_team_size=2, codeptr_ra=0x{{[0-f]+}} +// CHECK-SAME: invoker={{.*}} + +// CHECK: {{^}}[[MASTER_ID_2]]: ompt_event_parallel_end: +// CHECK-SAME: parallel_id=[[PARALLEL_ID_2]], task_id=[[PARENT_TASK_ID_2]] +// CHECK-SAME: invoker={{[0-9]+}} + +// CHECK: {{^}}[[MASTER_ID_2]]: ompt_event_thread_end: +// CHECK-SAME: thread_id=[[MASTER_ID_2]] + +// first worker thread +// CHECK: {{^}}[[THREAD_ID_1:[0-9]+]]: ompt_event_thread_begin: +// CHECK-SAME: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID_1]] + +// CHECK: {{^}}[[THREAD_ID_1]]: ompt_event_thread_end: +// CHECK-SAME: thread_id=[[THREAD_ID_1]] + +// second worker thread +// CHECK: {{^}}[[THREAD_ID_2:[0-9]+]]: ompt_event_thread_begin: +// CHECK-SAME: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID_2]] + +// CHECK: {{^}}[[THREAD_ID_2]]: ompt_event_thread_end: +// CHECK-SAME: thread_id=[[THREAD_ID_2]] diff --git a/final/runtime/test/ompt/misc/threads.c b/final/runtime/test/ompt/misc/threads.c new file mode 100644 index 0000000..4a0fc6f --- /dev/null +++ b/final/runtime/test/ompt/misc/threads.c @@ -0,0 +1,34 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +#include "callback.h" +#include <omp.h> + +int main() { + int x = 0; +#pragma omp parallel num_threads(4) + { +#pragma omp atomic + x++; + } + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_thread_begin: + // CHECK-SAME: thread_type=ompt_thread_initial=1, thread_id=[[MASTER_ID]] + // CHECK: {{^}}[[MASTER_ID]]: 
ompt_event_thread_end: + // CHECK-SAME: thread_id=[[MASTER_ID]] + // CHECK: {{^}}[[WORKER_ID1:[0-9]+]]: ompt_event_thread_begin: + // CHECK-SAME: thread_type=ompt_thread_worker=2, thread_id=[[WORKER_ID1]] + // CHECK: {{^}}[[WORKER_ID1]]: ompt_event_thread_end: + // CHECK-SAME: thread_id=[[WORKER_ID1]] + // CHECK: {{^}}[[WORKER_ID2:[0-9]+]]: ompt_event_thread_begin: + // CHECK-SAME: thread_type=ompt_thread_worker=2, thread_id=[[WORKER_ID2]] + // CHECK: {{^}}[[WORKER_ID2]]: ompt_event_thread_end: + // CHECK-SAME: thread_id=[[WORKER_ID2]] + // CHECK: {{^}}[[WORKER_ID3:[0-9]+]]: ompt_event_thread_begin: + // CHECK-SAME: thread_type=ompt_thread_worker=2, thread_id=[[WORKER_ID3]] + // CHECK: {{^}}[[WORKER_ID3]]: ompt_event_thread_end: + // CHECK-SAME: thread_id=[[WORKER_ID3]] + + return 0; +} diff --git a/final/runtime/test/ompt/misc/threads_nested.c b/final/runtime/test/ompt/misc/threads_nested.c new file mode 100644 index 0000000..0d38dcf --- /dev/null +++ b/final/runtime/test/ompt/misc/threads_nested.c @@ -0,0 +1,40 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +#include "callback.h" +#include <omp.h> + +int main() { + + int condition = 0; + int x = 0; + omp_set_nested(1); +#pragma omp parallel num_threads(2) + { +#pragma omp parallel num_threads(2) + { + OMPT_SIGNAL(condition); + OMPT_WAIT(condition, 4); + } + } + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_thread_begin: + // CHECK-SAME: thread_type=ompt_thread_initial=1, thread_id=[[MASTER_ID]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_thread_end: + // CHECK-SAME: thread_id=[[MASTER_ID]] + // CHECK: {{^}}[[WORKER_ID1:[0-9]+]]: ompt_event_thread_begin: + // CHECK-SAME: thread_type=ompt_thread_worker=2, thread_id=[[WORKER_ID1]] + // CHECK: {{^}}[[WORKER_ID1]]: ompt_event_thread_end: + // CHECK-SAME: thread_id=[[WORKER_ID1]] + // CHECK: {{^}}[[WORKER_ID2:[0-9]+]]: ompt_event_thread_begin: + // CHECK-SAME: 
thread_type=ompt_thread_worker=2, thread_id=[[WORKER_ID2]] + // CHECK: {{^}}[[WORKER_ID2]]: ompt_event_thread_end: + // CHECK-SAME: thread_id=[[WORKER_ID2]] + // CHECK: {{^}}[[WORKER_ID3:[0-9]+]]: ompt_event_thread_begin: + // CHECK-SAME: thread_type=ompt_thread_worker=2, thread_id=[[WORKER_ID3]] + // CHECK: {{^}}[[WORKER_ID3]]: ompt_event_thread_end: + // CHECK-SAME: thread_id=[[WORKER_ID3]] + + return 0; +} diff --git a/final/runtime/test/ompt/misc/unset_callback.c b/final/runtime/test/ompt/misc/unset_callback.c new file mode 100644 index 0000000..9074ad3 --- /dev/null +++ b/final/runtime/test/ompt/misc/unset_callback.c @@ -0,0 +1,29 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// REQUIRES: ompt +#include "callback.h" +#include <omp.h> + +int main() +{ + #pragma omp parallel num_threads(1) + { + + } + ompt_set_callback(ompt_callback_parallel_begin, NULL); + #pragma omp parallel num_threads(1) + { + + } + + // Check if libomp supports the callbacks for this test. + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_idle' + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_parallel_begin: + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: + // CHECK-NOT: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: + + return 0; +} diff --git a/final/runtime/test/ompt/ompt-signal.h b/final/runtime/test/ompt/ompt-signal.h new file mode 100644 index 0000000..b5c28cf --- /dev/null +++ b/final/runtime/test/ompt/ompt-signal.h @@ -0,0 +1,31 @@ +#if defined(WIN32) || defined(_WIN32) +#include <windows.h> +#define delay() Sleep(1); +#else +#include <unistd.h> +#define delay(t) usleep(t); +#endif + +// These functions are used to provide a signal-wait mechanism to enforce expected scheduling for the test cases. +// Conditional variable (s) needs to be shared! 
Initialize to 0 + +#define OMPT_SIGNAL(s) ompt_signal(&s) +//inline +void ompt_signal(int* s) +{ + #pragma omp atomic + (*s)++; +} + +#define OMPT_WAIT(s,v) ompt_wait(&s,v) +// wait for s >= v +//inline +void ompt_wait(int *s, int v) +{ + int wait=0; + do{ + delay(10); + #pragma omp atomic read + wait = (*s); + }while(wait<v); +} diff --git a/final/runtime/test/ompt/parallel/dynamic_enough_threads.c b/final/runtime/test/ompt/parallel/dynamic_enough_threads.c new file mode 100644 index 0000000..4c340ba --- /dev/null +++ b/final/runtime/test/ompt/parallel/dynamic_enough_threads.c @@ -0,0 +1,43 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// REQUIRES: ompt +#include "callback.h" + +int main() +{ + omp_set_dynamic(1); + + #pragma omp parallel num_threads(4) + { + print_ids(0); + print_ids(1); + } + print_fuzzy_address(1); + + // Check if libomp supports the callbacks for this test. + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_begin' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_end' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task' + + //team-size of 1-4 is expected + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // make sure initial data pointers are null + // CHECK-NOT: 0: parallel_data initially not null + // CHECK-NOT: 0: task_data initially not null + // CHECK-NOT: 0: thread_data initially not null + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]] + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: 
parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]], team_size={{[1-4]}} + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + + return 0; +} diff --git a/final/runtime/test/ompt/parallel/dynamic_not_enough_threads.c b/final/runtime/test/ompt/parallel/dynamic_not_enough_threads.c new file mode 100644 index 0000000..f3a6e17 --- /dev/null +++ b/final/runtime/test/ompt/parallel/dynamic_not_enough_threads.c @@ -0,0 +1,43 @@ +// RUN: %libomp-compile && env OMP_THREAD_LIMIT=2 %libomp-run | FileCheck %s +// REQUIRES: ompt +#include "callback.h" + +int main() +{ + omp_set_dynamic(1); + + #pragma omp parallel num_threads(4) + { + print_ids(0); + print_ids(1); + } + print_fuzzy_address(1); + + // Check if libomp supports the callbacks for this test. 
+ // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_begin' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_end' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task' + + //team-size of 1-4 is expected + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // make sure initial data pointers are null + // CHECK-NOT: 0: parallel_data initially not null + // CHECK-NOT: 0: task_data initially not null + // CHECK-NOT: 0: thread_data initially not null + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]] + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]], team_size={{[1-4]}} + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + + return 0; +} diff --git a/final/runtime/test/ompt/parallel/max_active_levels_serialized.c b/final/runtime/test/ompt/parallel/max_active_levels_serialized.c new file mode 100644 index 0000000..bbe73ef --- /dev/null +++ 
b/final/runtime/test/ompt/parallel/max_active_levels_serialized.c @@ -0,0 +1,73 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=THREADS %s +// REQUIRES: ompt +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7 +#include "callback.h" +#include <omp.h> + +int main() +{ + omp_set_nested(1); + omp_set_max_active_levels(1); + + #pragma omp parallel num_threads(2) + { + print_ids(0); + print_ids(1); + #pragma omp parallel num_threads(2) + { + print_ids(0); + print_ids(1); + print_ids(2); + } + print_fuzzy_address(1); + } + print_fuzzy_address(2); + + // Check if libomp supports the callbacks for this test. + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released' + + // THREADS: 0: NULL_POINTER=[[NULL:.*$]] + + // make sure initial data pointers are null + // CHECK-NOT: 0: parallel_data initially not null + // CHECK-NOT: 0: task_data initially not null + // CHECK-NOT: 0: thread_data initially not null + + // THREADS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=2, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]] + + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // 
THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=2, codeptr_ra=[[NESTED_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS: 
{{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=2, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + return 0; +} diff --git a/final/runtime/test/ompt/parallel/nested.c b/final/runtime/test/ompt/parallel/nested.c new file mode 100644 index 0000000..035529c --- /dev/null +++ b/final/runtime/test/ompt/parallel/nested.c @@ -0,0 +1,298 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=THREADS %s +// REQUIRES: ompt +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7 +#define TEST_NEED_PRINT_FRAME_FROM_OUTLINED_FN +#include "callback.h" +#include <omp.h> +#include <unistd.h> + +int main() +{ + int condition=0; + omp_set_nested(1); + print_frame(0); + + #pragma omp parallel num_threads(4) + { + print_frame_from_outlined_fn(1); + print_ids(0); + 
print_ids(1); + print_frame(0); + + //get all implicit task events before starting nested: + #pragma omp barrier + + #pragma omp parallel num_threads(4) + { + print_frame_from_outlined_fn(1); + print_ids(0); + print_ids(1); + print_ids(2); + print_frame(0); + OMPT_SIGNAL(condition); + OMPT_WAIT(condition,16); + #pragma omp barrier + print_fuzzy_address(1); + print_ids(0); + } + print_fuzzy_address(2); + print_ids(0); + } + print_fuzzy_address(3); + + + // Check if libomp supports the callbacks for this test. + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released' + + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // make sure initial data pointers are null + // CHECK-NOT: 0: parallel_data initially not null + // CHECK-NOT: 0: task_data initially not null + // CHECK-NOT: 0: thread_data initially not null + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]] + + // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // Note that we cannot ensure that the worker threads have already called barrier_end and implicit_task_end before parallel_end! 
+ + // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + + // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + + // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + + + // THREADS: {{^}}0: NULL_POINTER=[[NULL:.*$]] + // THREADS: {{^}}[[MASTER_ID:[0-9]+]]: __builtin_frame_address(0)=[[MAIN_REENTER:0x[0-f]+]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=[[MAIN_REENTER]], parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]] + + // nested parallel masters + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[MASTER_ID]]: __builtin_frame_address({{.}})=[[EXIT:0x[0-f]+]] + // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]] + // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]], 
exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]] + // THREADS: {{^}}[[MASTER_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[EXIT]], parent_task_frame.reenter=[[REENTER]], parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[MASTER_ID]]: __builtin_frame_address({{.}})=[[NESTED_EXIT:0x[0-f]+]] + // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_EXIT]], reenter_frame=[[NULL]] + // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[REENTER]] + // THREADS: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]] + // THREADS: {{^}}[[MASTER_ID]]: __builtin_frame_address(0)=[[NESTED_REENTER:0x[0-f]+]] + // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end + // explicit barrier + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], codeptr_ra=[[BARRIER_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_EXIT]], reenter_frame=[[NESTED_REENTER]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[BARRIER_RETURN_ADDRESS]] + // THREADS: {{^}}[[MASTER_ID]]: task level 0: 
parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_EXIT]], reenter_frame=[[NULL]] + // implicit barrier + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}} + // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]], codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}} + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]], codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}} + // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]] + // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]] + // implicit barrier + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], 
task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: 
ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: 
{{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, 
task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // nested parallel worker threads + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // can't reliably tell which parallel region is the parent... 
+ // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // can't reliably tell which parallel region is the parent... + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // can't reliably tell which parallel region is the parent... 
+ // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // can't reliably tell which parallel region is the parent... + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // can't reliably tell which parallel region is the parent... 
+ // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // can't reliably tell which parallel region is the parent... + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // can't reliably tell which parallel region is the parent... 
+ // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // can't reliably tell which parallel region is the parent... + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // can't reliably tell which parallel region is the parent... 
+ // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // can't reliably tell which parallel region is the parent... + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // can't reliably tell which parallel region is the parent... 
+ // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // can't reliably tell which parallel region is the parent... + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + return 0; +} diff --git a/final/runtime/test/ompt/parallel/nested_lwt.c b/final/runtime/test/ompt/parallel/nested_lwt.c new file mode 100644 index 0000000..8348376 --- /dev/null +++ b/final/runtime/test/ompt/parallel/nested_lwt.c @@ -0,0 +1,334 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=THREADS %s +// REQUIRES: ompt +// 
UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7 +#include "callback.h" +#include <omp.h> +#include <unistd.h> + +int main() +{ + omp_set_nested(1); + int condition = 0; + + #pragma omp parallel num_threads(4) + { + print_ids(0); + print_ids(1); + //get all implicit task events before starting nested: + #pragma omp barrier + #pragma omp parallel num_threads(1) + { + print_ids(0); + print_ids(1); + print_ids(2); + //get all implicit task events before starting nested: + #pragma omp barrier + #pragma omp parallel num_threads(4) + { + print_ids(0); + print_ids(1); + print_ids(2); + print_ids(3); + OMPT_SIGNAL(condition); + OMPT_WAIT(condition,16); + } + print_fuzzy_address(1); + } + print_fuzzy_address(2); + } + print_fuzzy_address(3); + + // Check if libomp supports the callbacks for this test. + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released' + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // make sure initial data pointers are null + // CHECK-NOT: 0: parallel_data initially not null + // CHECK-NOT: 0: task_data initially not null + // CHECK-NOT: 0: thread_data initially not null + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:[0-9]+]] + + // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK-DAG: 
{{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // Note that we cannot ensure that the worker threads have already called barrier_end and implicit_task_end before parallel_end! + + // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + + // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + + // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + + + // THREADS: 0: NULL_POINTER=[[NULL:.*$]] + // THREADS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=[[TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]] + + // nested parallel masters + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]] + // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], 
task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] + + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=[[NESTED_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]] + // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]] + // THREADS: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] + + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[NESTED_IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_NESTED_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]] + // THREADS: {{^}}[[MASTER_ID]]: task level 1: 
parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_NESTED_TASK_FRAME_ENTER]] + // THREADS: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]] + // THREADS: {{^}}[[MASTER_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] + // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[NESTED_NESTED_RETURN_ADDRESS]] + // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]] + // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: 
{{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]] + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[NESTED_IMPLICIT_TASK_ID]], 
parent_task_frame.exit=[[NESTED_NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_NESTED_TASK_FRAME_ENTER]] + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]] + // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_NESTED_RETURN_ADDRESS]] + // THREADS-NOT: 
{{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], 
reenter_frame=[[NULL]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]] + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[NESTED_IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_NESTED_TASK_FRAME_ENTER]] + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]] + // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: 
parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_NESTED_RETURN_ADDRESS]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_TASK_FRAME_ENTER:0x[0-f]+]], 
parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]] + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[NESTED_IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_NESTED_TASK_FRAME_ENTER]] + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], 
reenter_frame=[[NESTED_TASK_FRAME_ENTER]] + // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_NESTED_RETURN_ADDRESS]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // nested parallel worker threads + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: 
task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]] + // can't reliably tell which parallel region is the parent... + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]] + // can't reliably tell which parallel region is the parent... 
+ // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]] + // can't reliably tell which parallel region is the parent... 
+ // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]] + // can't reliably tell which parallel region is the parent... 
+ // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]] + // can't reliably tell which parallel region is the parent... 
+ // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]] + // can't reliably tell which parallel region is the parent... 
+ // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]] + // can't reliably tell which parallel region is the parent... 
+ // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]] + // can't reliably tell which parallel region is the parent... 
+ // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]] + // can't reliably tell which parallel region is the parent... 
+ // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]] + // can't reliably tell which parallel region is the parent... 
+ // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]] + // can't reliably tell which parallel region is the parent... 
+ // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]] + // can't reliably tell which parallel region is the parent... 
+ // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} + // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + + return 0; +} diff --git a/final/runtime/test/ompt/parallel/nested_serialized.c b/final/runtime/test/ompt/parallel/nested_serialized.c new file mode 100644 index 0000000..f87b8f4 --- /dev/null +++ b/final/runtime/test/ompt/parallel/nested_serialized.c @@ -0,0 +1,128 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=THREADS %s +// REQUIRES: ompt +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7 +#include "callback.h" +#include <omp.h> + +int main() +{ + omp_set_nested(0); + + #pragma omp parallel num_threads(4) + { + print_ids(0); + print_ids(1); + #pragma omp parallel num_threads(4) + { + print_ids(0); + print_ids(1); + print_ids(2); + } + print_fuzzy_address(1); + } + print_fuzzy_address(2); + + // Check if libomp supports the callbacks for this test. 
+ // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released' + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // make sure initial data pointers are null + // CHECK-NOT: 0: parallel_data initially not null + // CHECK-NOT: 0: task_data initially not null + // CHECK-NOT: 0: thread_data initially not null + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:[0-9]+]] + + // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // Note that we cannot ensure that the worker threads have already called barrier_end and implicit_task_end before parallel_end! 
+ + // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + + // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + + // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + + + // THREADS: 0: NULL_POINTER=[[NULL:.*$]] + // THREADS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]] + + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]] 
+ // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]] + // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, 
parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, 
invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], 
task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + return 0; +} diff --git a/final/runtime/test/ompt/parallel/nested_thread_num.c b/final/runtime/test/ompt/parallel/nested_thread_num.c new file mode 100644 index 0000000..e952f80 --- /dev/null +++ b/final/runtime/test/ompt/parallel/nested_thread_num.c @@ -0,0 +1,357 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=THREADS %s +// REQUIRES: ompt +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7 +#define TEST_NEED_PRINT_FRAME_FROM_OUTLINED_FN +#include "callback.h" +#include <omp.h> +#include <unistd.h> + +int main() { + int condition = 0; + omp_set_nested(1); + print_frame(0); + +#pragma omp parallel num_threads(2) + { + print_frame_from_outlined_fn(1); + print_ids(0); + print_ids(1); + print_frame(0); + +// get all implicit task events before starting nested: +#pragma omp barrier + +#pragma omp parallel num_threads(2) + { + 
print_frame_from_outlined_fn(1); + print_ids(0); + print_ids(1); + print_ids(2); + print_frame(0); + OMPT_SIGNAL(condition); + OMPT_WAIT(condition, 4); +#pragma omp barrier + print_fuzzy_address(1); + print_ids(0); + } + print_fuzzy_address(2); + print_ids(0); + } + print_fuzzy_address(3); + + return 0; +} +// Check if libomp supports the callbacks for this test. +// CHECK-NOT: {{^}}0: Could not register callback + +// CHECK: 0: NULL_POINTER=[[NULL:.*$]] + +// make sure initial data pointers are null +// CHECK-NOT: 0: parallel_data initially not null +// CHECK-NOT: 0: task_data initially not null +// CHECK-NOT: 0: thread_data initially not null + +// CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: +// CHECK-SAME: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], +// CHECK-SAME: parent_task_frame.exit=[[NULL]], +// CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}}, +// CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]], +// CHECK-SAME: requested_team_size=2, +// CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, +// CHECK-SAME: invoker=[[PARALLEL_INVOKER:[0-9]+]] + +// CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: +// CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: + +// Note that we cannot ensure that the worker threads have already called +// barrier_end and implicit_task_end before parallel_end! 
+ +// CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: +// CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: + + +// CHECK: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], +// CHECK-SAME: task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] +// CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + +// THREADS: {{^}}0: NULL_POINTER=[[NULL:.*$]] +// THREADS: __builtin_frame_address(0)=[[MAIN_REENTER:0x[0-f]+]] +// THREADS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: +// THREADS-SAME: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], +// THREADS-SAME: parent_task_frame.exit=[[NULL]], +// THREADS-SAME: parent_task_frame.reenter=[[MAIN_REENTER]], +// THREADS-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=2, +// THREADS-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, +// THREADS-SAME: invoker=[[PARALLEL_INVOKER:[0-9]+]] + +// nested parallel masters +// THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: +// THREADS-SAME: parallel_id=[[PARALLEL_ID]], +// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]], +// THREADS-SAME: team_size=2, thread_num=0 + +// THREADS: __builtin_frame_address({{.}})=[[EXIT:0x[0-f]+]] + +// THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], +// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], +// THREADS-SAME: reenter_frame=[[NULL]], +// THREADS-SAME: thread_num=0 + +// THREADS: {{^}}[[MASTER_ID]]: task level 1: +// THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], +// THREADS-SAME: task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], +// THREADS-SAME: reenter_frame=[[MAIN_REENTER]] + +// THREADS: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]] + +// THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: +// THREADS-SAME: parent_task_id=[[IMPLICIT_TASK_ID]], +// THREADS-SAME: parent_task_frame.exit=[[EXIT]], +// THREADS-SAME: parent_task_frame.reenter=[[REENTER]], +// THREADS-SAME: 
parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], +// THREADS-SAME: requested_team_size=2, +// THREADS-SAME: codeptr_ra=[[NESTED_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, +// THREADS-SAME: invoker=[[PARALLEL_INVOKER]] + +// THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], +// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]], team_size=2, +// THREADS-SAME: thread_num=0 + +// THREADS: __builtin_frame_address({{.}})=[[NESTED_EXIT:0x[0-f]+]] + +// THREADS: {{^}}[[MASTER_ID]]: task level 0: +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], +// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]], +// THREADS-SAME: exit_frame=[[NESTED_EXIT]], reenter_frame=[[NULL]], +// THREADS-SAME: thread_num=0 + +// THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], +// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], +// THREADS-SAME: reenter_frame=[[REENTER]] + +// THREADS: {{^}}[[MASTER_ID]]: task level 2: +// THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]], +// THREADS-SAME: task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], +// THREADS-SAME: reenter_frame=[[MAIN_REENTER]] + +// THREADS: __builtin_frame_address(0)=[[NESTED_REENTER:0x[0-f]+]] + +// THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end +// explicit barrier + +// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], +// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]], +// THREADS-SAME: codeptr_ra=[[BARRIER_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + +// THREADS: {{^}}[[MASTER_ID]]: task level 0: +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], +// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]], +// THREADS-SAME: exit_frame=[[NESTED_EXIT]], reenter_frame=[[NESTED_REENTER]] + +// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], +// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]] + +// 
THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[BARRIER_RETURN_ADDRESS]] + +// THREADS: {{^}}[[MASTER_ID]]: task level 0: +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], +// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]], +// THREADS-SAME: exit_frame=[[NESTED_EXIT]], reenter_frame=[[NULL]] + +// implicit barrier +// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], +// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]], +// THREADS-SAME: codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}} + +// THREADS: {{^}}[[MASTER_ID]]: task level 0: +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], +// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]], +// THREADS-SAME: exit_frame=[[NULL]], reenter_frame=[[NULL]] + +// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: +// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]], +// THREADS-SAME: codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}} + +// THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: +// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] + +// THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], +// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]], +// THREADS-SAME: invoker=[[PARALLEL_INVOKER]], +// THREADS-SAME: codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}} + +// THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]] + +// THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end + +// THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], +// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], +// THREADS-SAME: reenter_frame=[[NULL]] + +// implicit barrier +// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: +// THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], +// THREADS-SAME: codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + +// THREADS: 
{{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], +// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], +// THREADS-SAME: reenter_frame=[[NULL]] + +// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: +// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]], +// THREADS-SAME: codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + +// THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: +// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + +// THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: +// THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], +// THREADS-SAME: invoker=[[PARALLEL_INVOKER]], +// THREADS-SAME: codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + +// THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + +// Worker of first nesting level + +// THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: +// THREADS-SAME: parallel_id=[[PARALLEL_ID]], +// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]], team_size=2, +// THREADS-SAME: thread_num=[[OUTER_THREADNUM:[0-9]+]] + +// THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], +// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]], +// THREADS-SAME: thread_num=[[OUTER_THREADNUM]] + +// THREADS: {{^}}[[THREAD_ID]]: task level 1: +// THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]], +// THREADS-SAME: task_id=[[PARENT_TASK_ID]] + +// THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: +// THREADS-SAME: parent_task_id=[[IMPLICIT_TASK_ID]], +// THREADS-SAME: parent_task_frame.exit={{0x[0-f]+}}, +// THREADS-SAME: parent_task_frame.reenter={{0x[0-f]+}}, +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=2, +// THREADS-SAME: codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, +// THREADS-SAME: invoker=[[PARALLEL_INVOKER]] + +// THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], +// THREADS-SAME: 
task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]], team_size=2, +// THREADS-SAME: thread_num=[[INNER_THREADNUM:[0-9]+]] + +// THREADS: {{^}}[[THREAD_ID]]: task level 0: +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], +// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]], +// THREADS-SAME: thread_num=[[INNER_THREADNUM]] + +// THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], +// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]], +// THREADS-SAME: thread_num=[[OUTER_THREADNUM]] + +// THREADS: {{^}}[[THREAD_ID]]: task level 2: +// THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]], +// THREADS-SAME: task_id=[[PARENT_TASK_ID]] + +// THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + +// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], +// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]] + +// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: +// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] + +// THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: +// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] + +// THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], +// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + +// THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + +// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: +// THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + +// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: +// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + +// THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: +// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + +// nested parallel worker threads + +// THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], +// 
THREADS-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]] +// THREADS-SAME: thread_num=[[THREADNUM:[0-9]+]] + +// THREADS: {{^}}[[THREAD_ID]]: task level 0: +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], +// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]] +// THREADS-SAME: thread_num=[[THREADNUM]] + +// can't reliably tell which parallel region is the parent... + +// THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, +// THREADS-SAME: task_id={{[0-9]+}} +// THREADS-SAME: thread_num={{[01]}} + +// THREADS: {{^}}[[THREAD_ID]]: task level 2: +// THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]], +// THREADS-SAME: task_id=[[PARENT_TASK_ID]] +// THREADS-SAME: thread_num=0 + +// THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + +// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], +// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]] + +// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: +// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + +// THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: +// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + +// other nested parallel worker threads + +// THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], +// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]] +// THREADS-SAME: thread_num=[[THREADNUM:[0-9]+]] + +// THREADS: {{^}}[[THREAD_ID]]: task level 0: +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], +// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]] +// THREADS-SAME: thread_num=[[THREADNUM]] + +// can't reliably tell which parallel region is the parent... 
+ +// THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, +// THREADS-SAME: task_id={{[0-9]+}} +// THREADS-SAME: thread_num={{[01]}} + +// THREADS: {{^}}[[THREAD_ID]]: task level 2: +// THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]], +// THREADS-SAME: task_id=[[PARENT_TASK_ID]] +// THREADS-SAME: thread_num=0 + +// THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + +// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], +// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]] + +// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: +// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + +// THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: +// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + diff --git a/final/runtime/test/ompt/parallel/no_thread_num_clause.c b/final/runtime/test/ompt/parallel/no_thread_num_clause.c new file mode 100644 index 0000000..e23d89a --- /dev/null +++ b/final/runtime/test/ompt/parallel/no_thread_num_clause.c @@ -0,0 +1,95 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=THREADS %s +// REQUIRES: ompt +#include "callback.h" + +int main() +{ + omp_set_num_threads(4); + #pragma omp parallel + { + print_ids(0); + print_ids(1); + } + print_fuzzy_address(1); + + // Check if libomp supports the callbacks for this test. 
+ // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_begin' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_end' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released' + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // make sure initial data pointers are null + // CHECK-NOT: 0: parallel_data initially not null + // CHECK-NOT: 0: task_data initially not null + // CHECK-NOT: 0: thread_data initially not null + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:[0-9]+]] + + // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // Note that we cannot ensure that the worker threads have already called barrier_end and implicit_task_end before parallel_end! 
+ + // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + + // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + + // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + + + // THREADS: 0: NULL_POINTER=[[NULL:.*$]] + // THREADS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_thread_begin: thread_type=ompt_thread_initial=1, thread_id=[[MASTER_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=0, parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=[[NULL]], new_task_id=281474976710658, codeptr_ra=[[NULL]], task_type=ompt_task_initial=1, has_dependences=no + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker={{[0-9]+}} + + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], 
task_id=[[PARENT_TASK_ID]] + // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // 
THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + return 0; +} diff --git a/final/runtime/test/ompt/parallel/normal.c b/final/runtime/test/ompt/parallel/normal.c new file mode 100644 index 0000000..2cc9ce1 --- /dev/null +++ b/final/runtime/test/ompt/parallel/normal.c @@ -0,0 +1,132 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// RUN: %libomp-compile-and-run | %sort-threads \ +// RUN: | FileCheck --check-prefix=THREADS %s +// REQUIRES: ompt +#include "callback.h" + +int main() { +#pragma omp parallel num_threads(4) + { + print_ids(0); + print_ids(1); + } + print_fuzzy_address(1); + + // Check if libomp supports the callbacks for this test. 
+ // CHECK-NOT: {{^}}0: Could not register callback + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // make sure initial data pointers are null + // CHECK-NOT: 0: parallel_data initially not null + // CHECK-NOT: 0: task_data initially not null + // CHECK-NOT: 0: thread_data initially not null + + // Only check callback names, arguments are verified in THREADS below. + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin + + // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin + // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end + + // Note that we cannot ensure that the worker threads have already called + // barrier_end and implicit_task_end before parallel_end! + + // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin + + // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin + + // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end + + // THREADS: 0: NULL_POINTER=[[NULL:.*$]] + // THREADS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_thread_begin + // THREADS-SAME: thread_type=ompt_thread_initial=1, thread_id=[[MASTER_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin + // THREADS-SAME: parent_task_id=[[PARENT_TASK_ID:[0-9]+]] + // THREADS-SAME: parent_task_frame.exit=[[NULL]] + // THREADS-SAME: parent_task_frame.reenter={{0x[0-f]+}} + // THREADS-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4 + // THREADS-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin + // THREADS-SAME: parallel_id=[[PARALLEL_ID]] + // THREADS-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[MASTER_ID]]: task level 0 + // THREADS-SAME: 
parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: task level 1 + // THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]] + // THREADS-SAME: task_id=[[PARENT_TASK_ID]] + // THREADS-NOT: ompt_event_implicit_task_end + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin + // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS-SAME: codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end + // parallel_id is 0 because the region ended in the barrier! + // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] + // THREADS-SAME: codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end + // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin + // THREADS-SAME: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin + // THREADS-SAME: parallel_id=[[PARALLEL_ID]] + // THREADS-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0 + // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1 + // THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]] + // THREADS-SAME: task_id=[[PARENT_TASK_ID]] + // THREADS-NOT: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin + // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end + // parallel_id is 0 because the region ended in the barrier! 
+ // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin + // THREADS-SAME: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin + // THREADS-SAME: parallel_id=[[PARALLEL_ID]] + // THREADS-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0 + // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1 + // THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]] + // THREADS-SAME: task_id=[[PARENT_TASK_ID]] + // THREADS-NOT: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin + // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end + // parallel_id is 0 because the region ended in the barrier! 
+ // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin + // THREADS-SAME: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin + // THREADS-SAME: parallel_id=[[PARALLEL_ID]] + // THREADS-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0 + // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1 + // THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]] + // THREADS-SAME: task_id=[[PARENT_TASK_ID]] + // THREADS-NOT: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin + // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end + // parallel_id is 0 because the region ended in the barrier! + // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] + + return 0; +} diff --git a/final/runtime/test/ompt/parallel/not_enough_threads.c b/final/runtime/test/ompt/parallel/not_enough_threads.c new file mode 100644 index 0000000..8a0469a --- /dev/null +++ b/final/runtime/test/ompt/parallel/not_enough_threads.c @@ -0,0 +1,90 @@ +// RUN: %libomp-compile && env OMP_THREAD_LIMIT=2 %libomp-run | FileCheck %s +// RUN: %libomp-compile && env OMP_THREAD_LIMIT=2 %libomp-run | %sort-threads \ +// RUN: | FileCheck --check-prefix=THREADS %s + +// REQUIRES: ompt + +#include "callback.h" + +int main() { +#pragma omp parallel num_threads(4) + { + print_ids(0); + print_ids(1); + } + print_fuzzy_address(1); + + // Check if libomp supports the callbacks for this test. 
+ // CHECK-NOT: {{^}}0: Could not register callback + + // Make sure initial data pointers are null. + // CHECK-NOT: 0: parallel_data initially not null + // CHECK-NOT: 0: task_data initially not null + // CHECK-NOT: 0: thread_data initially not null + + // Only check callback names, arguments are verified in THREADS below. + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin + + // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin + // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end + + // Note that we cannot ensure that the worker threads have already called + // barrier_end and implicit_task_end before parallel_end! + + // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end + + // THREADS: 0: NULL_POINTER=[[NULL:.*$]] + // THREADS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_thread_begin + // THREADS-SAME: thread_type=ompt_thread_initial=1, thread_id=[[MASTER_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin + // THREADS-SAME: parent_task_id=[[PARENT_TASK_ID:[0-9]+]] + // THREADS-SAME: parent_task_frame.exit=[[NULL]] + // THREADS-SAME: parent_task_frame.reenter={{0x[0-f]+}} + // THREADS-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4 + // THREADS-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin + // THREADS-SAME: parallel_id=[[PARALLEL_ID]] + // THREADS-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[MASTER_ID]]: task level 0 + // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: task level 1 + // THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]] + // THREADS-SAME: task_id=[[PARENT_TASK_ID]] + + // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[MASTER_ID]]: 
ompt_event_barrier_begin + // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // parallel_id is 0 because the region ended in the barrier! + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end + // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end + // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end + // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + + // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin + // THREADS-SAME: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin + // THREADS-SAME: parallel_id=[[PARALLEL_ID]] + // THREADS-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // THREADS: {{^}}[[THREAD_ID]]: task level 0 + // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: task level 1 + // THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]] + // THREADS-SAME: task_id=[[PARENT_TASK_ID]] + // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin + // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // parallel_id is 0 because the region ended in the barrier! 
+ // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end + // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] + + return 0; +} diff --git a/final/runtime/test/ompt/parallel/parallel_if0.c b/final/runtime/test/ompt/parallel/parallel_if0.c new file mode 100644 index 0000000..f5c4454 --- /dev/null +++ b/final/runtime/test/ompt/parallel/parallel_if0.c @@ -0,0 +1,76 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// REQUIRES: ompt +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7 +#include "callback.h" + +int main() +{ +// print_frame(0); + #pragma omp parallel if(0) + { +// print_frame(1); + print_ids(0); + print_ids(1); +// print_frame(0); + #pragma omp parallel if(0) + { +// print_frame(1); + print_ids(0); + print_ids(1); + print_ids(2); +// print_frame(0); + #pragma omp task + { +// print_frame(1); + print_ids(0); + print_ids(1); + print_ids(2); + print_ids(3); + } + } + print_fuzzy_address(1); + } + print_fuzzy_address(2); + + // Check if libomp supports the callbacks for this test. 
+ // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_event_implicit_task_begin' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_event_implicit_task_end' + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // make sure initial data pointers are null + // CHECK-NOT: 0: parallel_data initially not null + // CHECK-NOT: 0: task_data initially not null + // CHECK-NOT: 0: thread_data initially not null + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]] + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]] + // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=[[NESTED_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]] + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]] + // 
CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame={{0x[0-f]+}} + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[NESTED_IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id=[[EXPLICIT_TASK_ID:[0-9]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[NESTED_IMPLICIT_TASK_ID]], second_task_id=[[EXPLICIT_TASK_ID]], prior_task_status=ompt_task_others=4 + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[EXPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]] + // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[EXPLICIT_TASK_ID]], second_task_id=[[NESTED_IMPLICIT_TASK_ID]], prior_task_status=ompt_task_complete=1 + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_end: task_id=[[EXPLICIT_TASK_ID]] + + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=0, task_id=[[NESTED_IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], 
task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] + // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + + return 0; +} diff --git a/final/runtime/test/ompt/parallel/serialized.c b/final/runtime/test/ompt/parallel/serialized.c new file mode 100644 index 0000000..e7a9207 --- /dev/null +++ b/final/runtime/test/ompt/parallel/serialized.c @@ -0,0 +1,77 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// REQUIRES: ompt +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7 +#include "callback.h" + +int main() +{ +// print_frame(0); + #pragma omp parallel num_threads(1) + { +// print_frame(1); + print_ids(0); + print_ids(1); +// print_frame(0); + #pragma omp parallel num_threads(1) + { +// print_frame(1); + print_ids(0); + print_ids(1); + print_ids(2); +// print_frame(0); + #pragma omp task + { +// print_frame(1); + print_ids(0); + print_ids(1); + print_ids(2); + print_ids(3); + } + } + print_fuzzy_address(1); + } + print_fuzzy_address(2); + + // Check if libomp supports the callbacks for this test. 
+ // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_event_implicit_task_begin' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_event_implicit_task_end' + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // make sure initial data pointers are null + // CHECK-NOT: 0: parallel_data initially not null + // CHECK-NOT: 0: task_data initially not null + // CHECK-NOT: 0: thread_data initially not null + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=[[OUTER_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]] + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]] + // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=[[INNER_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]] + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]] 
+ // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame={{0x[0-f]+}} + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[NESTED_IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id=[[EXPLICIT_TASK_ID:[0-9]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[NESTED_IMPLICIT_TASK_ID]], second_task_id=[[EXPLICIT_TASK_ID]], prior_task_status=ompt_task_others=4 + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[EXPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]] + // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[EXPLICIT_TASK_ID]], second_task_id=[[NESTED_IMPLICIT_TASK_ID]], prior_task_status=ompt_task_complete=1 + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_end: task_id=[[EXPLICIT_TASK_ID]] + + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=0, task_id=[[NESTED_IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]], codeptr_ra=[[INNER_RETURN_ADDRESS]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[INNER_RETURN_ADDRESS]] + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: 
ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]], codeptr_ra=[[OUTER_RETURN_ADDRESS]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[OUTER_RETURN_ADDRESS]] + + return 0; +} diff --git a/final/runtime/test/ompt/synchronization/barrier/explicit.c b/final/runtime/test/ompt/synchronization/barrier/explicit.c new file mode 100644 index 0000000..d60acd6 --- /dev/null +++ b/final/runtime/test/ompt/synchronization/barrier/explicit.c @@ -0,0 +1,58 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7 +#include "callback.h" +#include <omp.h> + +int main() +{ + int x = 0; + + #pragma omp parallel num_threads(2) + { + #pragma omp atomic + x++; + + #pragma omp barrier + print_current_address(); + + #pragma omp atomic + x++; + } + + + // Check if libomp supports the callbacks for this test. + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait' + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // master thread explicit barrier + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]] + + // master thread implicit barrier at parallel end + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}} + 
// CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}} + + + // worker thread explicit barrier + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]] + // CHECK: {{^}}[[THREAD_ID]]: current_address={{.*}}[[RETURN_ADDRESS]] + + // worker thread implicit barrier at parallel end + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + + return 0; +} diff --git a/final/runtime/test/ompt/synchronization/barrier/for_loop.c b/final/runtime/test/ompt/synchronization/barrier/for_loop.c new file mode 100644 index 0000000..5259447 --- /dev/null +++ b/final/runtime/test/ompt/synchronization/barrier/for_loop.c @@ -0,0 +1,56 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +// UNSUPPORTED: gcc-4, gcc-5, 
gcc-6, gcc-7 +#include "callback.h" +#include <omp.h> + +int main() +{ + int y[] = {0,1,2,3}; + + #pragma omp parallel num_threads(2) + { + //implicit barrier at end of for loop + int i; + #pragma omp for + for (i = 0; i < 4; i++) + { + y[i]++; + } + print_current_address(); + } + + + // Check if libomp supports the callbacks for this test. + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait' + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // master thread implicit barrier at loop end + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + + // master thread implicit barrier at parallel end + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + + + // worker thread implicit barrier at loop end + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: 
{{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + + // worker thread implicit barrier after parallel + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + + return 0; +} diff --git a/final/runtime/test/ompt/synchronization/barrier/for_simd.c b/final/runtime/test/ompt/synchronization/barrier/for_simd.c new file mode 100644 index 0000000..351b2c2 --- /dev/null +++ b/final/runtime/test/ompt/synchronization/barrier/for_simd.c @@ -0,0 +1,33 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// REQUIRES: ompt +// XFAIL: gcc-4 + +#include "callback.h" +#include <omp.h> + +int main() +{ + int y[] = {0,1,2,3}; + + int i; + #pragma omp for simd + for (i = 0; i < 4; i++) + { + y[i]++; + } + + + // Check if libomp supports the callbacks for this test. 
+ // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait' + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // master thread implicit barrier at simd loop end + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + + return 0; +} diff --git a/final/runtime/test/ompt/synchronization/barrier/implicit_task_data.c b/final/runtime/test/ompt/synchronization/barrier/implicit_task_data.c new file mode 100644 index 0000000..0824b47 --- /dev/null +++ b/final/runtime/test/ompt/synchronization/barrier/implicit_task_data.c @@ -0,0 +1,150 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt + +// This test checks that values stored in task_data in a barrier_begin event +// are still present in the corresponding barrier_end event. +// Therefore, callback implementations different from the ones in callback.h are necessary. 
+// This is a test for an issue reported in +// https://github.com/OpenMPToolsInterface/LLVM-openmp/issues/39 + +#define _BSD_SOURCE +#include <stdio.h> +#include <unistd.h> +#include <inttypes.h> +#include <omp.h> +#include <ompt.h> + +static const char* ompt_thread_type_t_values[] = { + NULL, + "ompt_thread_initial", + "ompt_thread_worker", + "ompt_thread_other" +}; + +static ompt_get_unique_id_t ompt_get_unique_id; +static ompt_get_thread_data_t ompt_get_thread_data; + +int main() +{ + #pragma omp parallel num_threads(4) + { + #pragma omp master + { + sleep(1); + } + } + + + // Check if libomp supports the callbacks for this test. + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait' + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // master thread implicit barrier at parallel end + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id=0, task_id=[[TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]*}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id=0, task_id=[[TASK_ID]], codeptr_ra={{0x[0-f]*}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id=0, task_id=[[TASK_ID]], codeptr_ra={{0x[0-f]*}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id=0, task_id=[[TASK_ID]], codeptr_ra={{0x[0-f]*}} + + + // worker thread implicit barrier at parallel end + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id=0, task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id=0, task_id=[[TASK_ID]], codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id=0, task_id=[[TASK_ID]], codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=0, task_id=[[TASK_ID]], codeptr_ra=[[NULL]] + + return 0; +} + +static void +on_ompt_callback_thread_begin( + 
ompt_thread_type_t thread_type, + ompt_data_t *thread_data) +{ + if(thread_data->ptr) + printf("%s\n", "0: thread_data initially not null"); + thread_data->value = ompt_get_unique_id(); + printf("%" PRIu64 ": ompt_event_thread_begin: thread_type=%s=%d, thread_id=%" PRIu64 "\n", ompt_get_thread_data()->value, ompt_thread_type_t_values[thread_type], thread_type, thread_data->value); +} + +static void +on_ompt_callback_sync_region( + ompt_sync_region_kind_t kind, + ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + const void *codeptr_ra) +{ + switch(endpoint) + { + case ompt_scope_begin: + task_data->value = ompt_get_unique_id(); + if(kind == ompt_sync_region_barrier) + printf("%" PRIu64 ": ompt_event_barrier_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra); + break; + case ompt_scope_end: + if(kind == ompt_sync_region_barrier) + printf("%" PRIu64 ": ompt_event_barrier_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra); + break; + } +} + +static void +on_ompt_callback_sync_region_wait( + ompt_sync_region_kind_t kind, + ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + const void *codeptr_ra) +{ + switch(endpoint) + { + case ompt_scope_begin: + if(kind == ompt_sync_region_barrier) + printf("%" PRIu64 ": ompt_event_wait_barrier_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra); + break; + case ompt_scope_end: + if(kind == ompt_sync_region_barrier) + printf("%" PRIu64 ": ompt_event_wait_barrier_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra); + break; 
+ } +} + +#define register_callback_t(name, type) \ +do{ \ + type f_##name = &on_##name; \ + if (ompt_set_callback(name, (ompt_callback_t)f_##name) == \ + ompt_set_never) \ + printf("0: Could not register callback '" #name "'\n"); \ +}while(0) + +#define register_callback(name) register_callback_t(name, name##_t) + +int ompt_initialize( + ompt_function_lookup_t lookup, + ompt_data_t *tool_data) +{ + ompt_set_callback_t ompt_set_callback; + ompt_set_callback = (ompt_set_callback_t) lookup("ompt_set_callback"); + ompt_get_unique_id = (ompt_get_unique_id_t) lookup("ompt_get_unique_id"); + ompt_get_thread_data = (ompt_get_thread_data_t) lookup("ompt_get_thread_data"); + register_callback(ompt_callback_sync_region); + register_callback_t(ompt_callback_sync_region_wait, ompt_callback_sync_region_t); + register_callback(ompt_callback_thread_begin); + printf("0: NULL_POINTER=%p\n", (void*)NULL); + return 1; //success +} + +void ompt_finalize(ompt_data_t *tool_data) +{ + printf("0: ompt_event_runtime_shutdown\n"); +} + +ompt_start_tool_result_t* ompt_start_tool( + unsigned int omp_version, + const char *runtime_version) +{ + static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,&ompt_finalize, 0}; + return &ompt_start_tool_result; +} diff --git a/final/runtime/test/ompt/synchronization/barrier/parallel_region.c b/final/runtime/test/ompt/synchronization/barrier/parallel_region.c new file mode 100644 index 0000000..ea0a23f --- /dev/null +++ b/final/runtime/test/ompt/synchronization/barrier/parallel_region.c @@ -0,0 +1,40 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +#include "callback.h" +#include <omp.h> + +int main() +{ + int x = 0; + + //implicit barrier at end of a parallel region + #pragma omp parallel num_threads(2) + { + #pragma omp atomic + x++; + } + print_fuzzy_address(); + + + // Check if libomp supports the callbacks for this test. 
+ // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait' + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // master thread implicit barrier at parallel end + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + + + // worker thread implicit barrier at parallel end + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + + return 0; +} diff --git a/final/runtime/test/ompt/synchronization/barrier/sections.c b/final/runtime/test/ompt/synchronization/barrier/sections.c new file mode 100644 index 0000000..4e1dfdd --- /dev/null +++ b/final/runtime/test/ompt/synchronization/barrier/sections.c @@ -0,0 +1,63 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +#include "callback.h" +#include <omp.h> + +int main() +{ + int x = 0; + + #pragma omp parallel num_threads(2) + { + //implicit 
barrier after sections with nowait but with lastprivates + //implicit barrier at end of sections + #pragma omp sections + { + #pragma omp section + { + #pragma omp atomic + x++; + } + + #pragma omp section + { + #pragma omp atomic + x++; + } + } + } + + + // Check if libomp supports the callbacks for this test. + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait' + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // master thread implicit barrier at sections end + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + + // master thread implicit barrier at parallel end + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + + + // worker thread implicit barrier at sections end + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: 
{{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + + // worker thread implicit barrier at parallel end + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + + return 0; +} diff --git a/final/runtime/test/ompt/synchronization/barrier/single.c b/final/runtime/test/ompt/synchronization/barrier/single.c new file mode 100644 index 0000000..8ba8b52 --- /dev/null +++ b/final/runtime/test/ompt/synchronization/barrier/single.c @@ -0,0 +1,61 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7 +#include "callback.h" +#include <omp.h> + +int main() +{ + int x = 0; + + #pragma omp parallel num_threads(2) + { + //implicit barrier at end of single + #pragma omp single + { + x++; + } + print_fuzzy_address(); + //critical section to avoid merge of two barriers into one + #pragma omp critical + { + x++; + } + } + + + // Check if libomp supports the callbacks for this test. 
+ // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait' + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // master thread implicit barrier at single end + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + + // master thread implicit barrier at parallel end + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + + + // worker thread implicit barrier at single end + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, 
codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // CHECK: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + + // worker thread implicit barrier at parallel end + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + + return 0; +} diff --git a/final/runtime/test/ompt/synchronization/critical.c b/final/runtime/test/ompt/synchronization/critical.c new file mode 100644 index 0000000..ed982b7 --- /dev/null +++ b/final/runtime/test/ompt/synchronization/critical.c @@ -0,0 +1,32 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// REQUIRES: ompt +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7 +#include "callback.h" +#include <omp.h> + +int main() +{ + #pragma omp critical + { + print_current_address(1); + print_ids(0); + } + print_current_address(2); + + + // Check if libomp supports the callbacks for this test. 
+ // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_nest_lock' + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_wait_critical: wait_id=[[WAIT_ID:[0-9]+]], hint={{[0-9]+}}, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_critical: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS]] + // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_critical: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]] + // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]] + + return 0; +} diff --git a/final/runtime/test/ompt/synchronization/flush.c b/final/runtime/test/ompt/synchronization/flush.c new file mode 100644 index 0000000..287d035 --- /dev/null +++ b/final/runtime/test/ompt/synchronization/flush.c @@ -0,0 +1,30 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +// GCC generates code that does not call the runtime for the flush construct +// XFAIL: gcc + +#include "callback.h" +#include <omp.h> + +int main() { +#pragma omp parallel num_threads(2) + { + int tid = omp_get_thread_num(); + +#pragma omp flush + print_current_address(1); + } + + return 0; +} +// Check if libomp supports the callbacks for this test. 
+// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_flush' + +// CHECK: 0: NULL_POINTER=[[NULL:.*$]] +// CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_flush: +// CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]] +// CHECK: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]] +// +// CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_flush: +// CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]] +// CHECK: {{^}}[[THREAD_ID]]: current_address={{.*}}[[RETURN_ADDRESS]] diff --git a/final/runtime/test/ompt/synchronization/lock.c b/final/runtime/test/ompt/synchronization/lock.c new file mode 100644 index 0000000..eae1575 --- /dev/null +++ b/final/runtime/test/ompt/synchronization/lock.c @@ -0,0 +1,44 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// REQUIRES: ompt +#include "callback.h" +#include <omp.h> + +int main() +{ + //need to use an OpenMP construct so that OMPT will be initalized + #pragma omp parallel num_threads(1) + print_ids(0); + + omp_lock_t lock; + printf("%" PRIu64 ": &lock: %" PRIu64 "\n", ompt_get_thread_data()->value, (uint64_t) &lock); + omp_init_lock(&lock); + print_fuzzy_address(1); + omp_set_lock(&lock); + print_fuzzy_address(2); + omp_unset_lock(&lock); + print_fuzzy_address(3); + omp_destroy_lock(&lock); + print_fuzzy_address(4); + + + // Check if libomp supports the callbacks for this test. 
+ // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_nest_lock' + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: &lock: [[WAIT_ID:[0-9]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_init_lock: wait_id=[[WAIT_ID]], hint={{[0-9]+}}, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_lock: wait_id=[[WAIT_ID]], hint={{[0-9]+}}, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_destroy_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + + return 0; +} diff --git a/final/runtime/test/ompt/synchronization/master.c b/final/runtime/test/ompt/synchronization/master.c new file mode 100644 index 0000000..8cc2d46 --- /dev/null +++ b/final/runtime/test/ompt/synchronization/master.c @@ -0,0 +1,38 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// REQUIRES: ompt +// GCC generates code that does not call the runtime for the master construct +// XFAIL: gcc + +#include "callback.h" +#include <omp.h> + +int main() { + int x = 0; +#pragma omp parallel num_threads(2) + { 
+#pragma omp master + { + print_fuzzy_address(1); + x++; + } + print_current_address(2); + } + + printf("%" PRIu64 ": x=%d\n", ompt_get_thread_data()->value, x); + + return 0; +} + +// Check if libomp supports the callbacks for this test. +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_master' + +// CHECK: 0: NULL_POINTER=[[NULL:.*$]] + +// CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_master_begin: +// CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]], task_id=[[TASK_ID:[0-9]+]], +// CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} +// CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_master_end: +// CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], +// CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS_END:0x[0-f]+]] +// CHECK: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS_END]] diff --git a/final/runtime/test/ompt/synchronization/nest_lock.c b/final/runtime/test/ompt/synchronization/nest_lock.c new file mode 100644 index 0000000..c83ceaf --- /dev/null +++ b/final/runtime/test/ompt/synchronization/nest_lock.c @@ -0,0 +1,52 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// REQUIRES: ompt +#include "callback.h" +#include <omp.h> + +int main() +{ + //need to use an OpenMP construct so that OMPT will be initalized + #pragma omp parallel num_threads(1) + print_ids(0); + + omp_nest_lock_t nest_lock; + printf("%" PRIu64 ": &nest_lock: %lli\n", ompt_get_thread_data()->value, (long long) &nest_lock); + omp_init_nest_lock(&nest_lock); + print_fuzzy_address(1); + omp_set_nest_lock(&nest_lock); + print_fuzzy_address(2); + omp_set_nest_lock(&nest_lock); + print_fuzzy_address(3); + omp_unset_nest_lock(&nest_lock); + print_fuzzy_address(4); + omp_unset_nest_lock(&nest_lock); + print_fuzzy_address(5); + omp_destroy_nest_lock(&nest_lock); + print_fuzzy_address(6); + + + // Check if libomp supports the callbacks for this test. 
+ // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_nest_lock' + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_init_nest_lock: wait_id=[[WAIT_ID:[0-9]+]], hint={{[0-9]+}}, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_nest_lock: wait_id=[[WAIT_ID]], hint={{[0-9]+}}, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_nest_lock_first: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_nest_lock: wait_id=[[WAIT_ID]], hint={{[0-9]+}}, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_nest_lock_next: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS]] + // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_nest_lock_prev: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_nest_lock_last: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_destroy_nest_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK-NEXT: {{^}}[[MASTER_ID]]: 
fuzzy_address={{.*}}[[RETURN_ADDRESS]] + + return 0; +} diff --git a/final/runtime/test/ompt/synchronization/ordered.c b/final/runtime/test/ompt/synchronization/ordered.c new file mode 100644 index 0000000..14284a4 --- /dev/null +++ b/final/runtime/test/ompt/synchronization/ordered.c @@ -0,0 +1,32 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// REQUIRES: ompt +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7 +#include "callback.h" +#include <omp.h> + +int main() +{ + #pragma omp ordered + { + print_current_address(1); + print_ids(0); + } + print_current_address(2); + + + // Check if libomp supports the callbacks for this test. + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_nest_lock' + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_wait_ordered: wait_id=[[WAIT_ID:[0-9]+]], hint={{[0-9]+}}, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_ordered: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS]] + // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_ordered: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]] + // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]] + + return 0; +} diff --git a/final/runtime/test/ompt/synchronization/taskgroup.c b/final/runtime/test/ompt/synchronization/taskgroup.c new file mode 100644 index 0000000..7309c0a --- /dev/null +++ b/final/runtime/test/ompt/synchronization/taskgroup.c @@ -0,0 +1,49 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7 + +#include "callback.h" +#include 
<unistd.h> +#include <stdio.h> + +int main() +{ + int condition=0; + int x=0; + #pragma omp parallel num_threads(2) + { + #pragma omp master + { + #pragma omp taskgroup + { + print_current_address(1); + #pragma omp task + { + #pragma omp atomic + x++; + } + } + print_current_address(2); + } + } + + + // Check if libomp supports the callbacks for this test. + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_master' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_cancel' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_begin' + + + // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_taskgroup_begin: parallel_id=[[PARALLEL_ID:[0-9]+]], task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]] + // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_taskgroup_begin: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_taskgroup_end: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra=[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_taskgroup_end: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra=[[RETURN_ADDRESS]] + // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]] + + return 0; +} diff --git a/final/runtime/test/ompt/synchronization/taskwait.c b/final/runtime/test/ompt/synchronization/taskwait.c new file mode 100644 index 0000000..c431024 --- /dev/null +++ b/final/runtime/test/ompt/synchronization/taskwait.c @@ -0,0 +1,36 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7 +#include "callback.h" +#include <omp.h> 
+ +int main() +{ + int x = 0; + #pragma omp parallel num_threads(2) + { + #pragma omp master + { + #pragma omp task + { + x++; + } + #pragma omp taskwait + print_current_address(1); + } + } + + // Check if libomp supports the callbacks for this test. + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait' + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_taskwait_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_taskwait_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_taskwait_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]] + // CHECK-NEXT: {{^}}[[MASTER_ID]]: ompt_event_taskwait_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]] + // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]] + + return 0; +} diff --git a/final/runtime/test/ompt/synchronization/test_lock.c b/final/runtime/test/ompt/synchronization/test_lock.c new file mode 100644 index 0000000..d24e4d6 --- /dev/null +++ b/final/runtime/test/ompt/synchronization/test_lock.c @@ -0,0 +1,54 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// REQUIRES: ompt + +#include "callback.h" +#include <omp.h> + +int main() +{ + omp_lock_t lock; + omp_init_lock(&lock); + print_fuzzy_address(1); + + omp_test_lock(&lock); + print_fuzzy_address(2); + omp_unset_lock(&lock); + print_fuzzy_address(3); + + omp_set_lock(&lock); + print_fuzzy_address(4); + omp_test_lock(&lock); + print_fuzzy_address(5); + omp_unset_lock(&lock); + print_fuzzy_address(6); + + omp_destroy_lock(&lock); + print_fuzzy_address(7); + + // Check if libomp supports the callbacks for this test. 
+ // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_nest_lock' + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_init_lock: wait_id=[[WAIT_ID:[0-9]+]], hint=0, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_lock: wait_id=[[WAIT_ID]], hint=0, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_lock: wait_id=[[WAIT_ID]], hint=0, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_lock: wait_id=[[WAIT_ID]], hint=0, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: 
ompt_event_destroy_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + + return 0; +} diff --git a/final/runtime/test/ompt/synchronization/test_nest_lock.c b/final/runtime/test/ompt/synchronization/test_nest_lock.c new file mode 100644 index 0000000..ad02d32 --- /dev/null +++ b/final/runtime/test/ompt/synchronization/test_nest_lock.c @@ -0,0 +1,42 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// REQUIRES: ompt + +#include "callback.h" +#include <omp.h> + +int main() +{ + omp_nest_lock_t nest_lock; + omp_init_nest_lock(&nest_lock); + + omp_test_nest_lock(&nest_lock); + omp_unset_nest_lock(&nest_lock); + + omp_set_nest_lock(&nest_lock); + omp_test_nest_lock(&nest_lock); + omp_unset_nest_lock(&nest_lock); + omp_unset_nest_lock(&nest_lock); + + omp_destroy_nest_lock(&nest_lock); + + // Check if libomp supports the callbacks for this test. + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_nest_lock' + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_init_nest_lock: wait_id=[[WAIT_ID:[0-9]+]], hint=0, impl={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_nest_lock: wait_id=[[WAIT_ID]], hint=0, impl={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_nest_lock_first: wait_id=[[WAIT_ID]], codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_nest_lock_last: wait_id=[[WAIT_ID]], codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_nest_lock: wait_id=[[WAIT_ID]], hint=0, impl={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: 
ompt_event_acquired_nest_lock_first: wait_id=[[WAIT_ID]], codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_nest_lock: wait_id=[[WAIT_ID]], hint=0, impl={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_nest_lock_next: wait_id=[[WAIT_ID]], codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_nest_lock_prev: wait_id=[[WAIT_ID]], codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_nest_lock_last: wait_id=[[WAIT_ID]], codeptr_ra={{0x[0-f]+}} + + return 0; +} diff --git a/final/runtime/test/ompt/synchronization/test_nest_lock_parallel.c b/final/runtime/test/ompt/synchronization/test_nest_lock_parallel.c new file mode 100644 index 0000000..e9240f7 --- /dev/null +++ b/final/runtime/test/ompt/synchronization/test_nest_lock_parallel.c @@ -0,0 +1,60 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7 +#include "callback.h" +#include <omp.h> + +int main() +{ + omp_nest_lock_t nest_lock; + omp_init_nest_lock(&nest_lock); + + #pragma omp parallel num_threads(2) + { + #pragma omp master + { + omp_set_nest_lock(&nest_lock); + print_fuzzy_address(1); + } + #pragma omp barrier + omp_test_nest_lock(&nest_lock); //should fail for non-master + print_fuzzy_address(2); + #pragma omp barrier + #pragma omp master + { + omp_unset_nest_lock(&nest_lock); + print_fuzzy_address(3); + omp_unset_nest_lock(&nest_lock); + print_fuzzy_address(4); + } + } + + omp_destroy_nest_lock(&nest_lock); + + // Check if libomp supports the callbacks for this test. 
+ // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_nest_lock' + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_wait_nest_lock: wait_id=[[WAIT_ID:[0-9]+]], hint=0, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_nest_lock_first: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_nest_lock: wait_id=[[WAIT_ID]], hint=0, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_nest_lock_next: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_nest_lock_prev: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_nest_lock_last: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_destroy_nest_lock: wait_id=[[WAIT_ID]] + + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_wait_nest_lock: wait_id=[[WAIT_ID]], hint=0, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK-NOT: {{^}}[[THREAD_ID]]: ompt_event_acquired_nest_lock_next: wait_id=[[WAIT_ID]] + // CHECK-NEXT: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + + return 0; +} diff 
--git a/final/runtime/test/ompt/tasks/dependences.c b/final/runtime/test/ompt/tasks/dependences.c new file mode 100644 index 0000000..57b61f9 --- /dev/null +++ b/final/runtime/test/ompt/tasks/dependences.c @@ -0,0 +1,61 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7 + +#include "callback.h" +#include <omp.h> +#include <math.h> +#include <unistd.h> + +int main() +{ + int x = 0; + #pragma omp parallel num_threads(2) + { + #pragma omp master + { + print_ids(0); + #pragma omp task depend(out:x) + { + x++; + delay(100); + } + print_fuzzy_address(1); + print_ids(0); + + #pragma omp task depend(in:x) + { + x = -1; + } + print_ids(0); + } + } + + x++; + + + // Check if libomp supports the callbacks for this test. + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_dependences' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_dependence' + + // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] + + // make sure initial data pointers are null + // CHECK-NOT: 0: new_task_data initially not null + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT:0x[0-f]+]], reenter_frame=[[NULL]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[EXIT]], parent_task_frame.reenter={{0x[0-f]+}}, new_task_id=[[FIRST_TASK:[0-f]+]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, task_type=ompt_task_explicit=4, has_dependences=yes + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_dependences: task_id=[[FIRST_TASK]], deps={{0x[0-f]+}}, ndeps=1 + // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + // CHECK: 
{{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]] + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[EXIT]], parent_task_frame.reenter={{0x[0-f]+}}, new_task_id=[[SECOND_TASK:[0-f]+]], codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit=4, has_dependences=yes + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_dependences: task_id=[[SECOND_TASK]], deps={{0x[0-f]+}}, ndeps=1 + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_dependence_pair: first_task_id=[[FIRST_TASK]], second_task_id=[[SECOND_TASK]] + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]] + + + return 0; +} diff --git a/final/runtime/test/ompt/tasks/explicit_task.c b/final/runtime/test/ompt/tasks/explicit_task.c new file mode 100644 index 0000000..01fb3f8 --- /dev/null +++ b/final/runtime/test/ompt/tasks/explicit_task.c @@ -0,0 +1,102 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7 +#define TEST_NEED_PRINT_FRAME_FROM_OUTLINED_FN +#include "callback.h" +#include <omp.h> + +int main() +{ + int condition=0; + omp_set_nested(0); + print_frame(0); + #pragma omp parallel num_threads(2) + { + print_frame_from_outlined_fn(1); + print_ids(0); + print_ids(1); + print_frame(0); + #pragma omp master + { + print_ids(0); + #pragma omp task shared(condition) + { + OMPT_SIGNAL(condition); + print_frame(1); + print_ids(0); + print_ids(1); + print_ids(2); + } + print_fuzzy_address(1); + OMPT_WAIT(condition,1); + print_ids(0); + } + #pragma omp barrier + print_ids(0); + } + + + // Check if libomp supports the callbacks for this test. 
+ // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released' + + + // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] + + // make sure initial data pointers are null + // CHECK-NOT: 0: new_task_data initially not null + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: __builtin_frame_address(0)=[[MAIN_REENTER:0x[0-f]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=[[MAIN_REENTER]], parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=2, codeptr_ra=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:[0-9]+]] + // nested parallel masters + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address({{.}})=[[EXIT:0x[0-f]+]] + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]] + // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]] + // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]] + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]] + // <- 
ompt_event_task_create would be expected here + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[EXIT]], parent_task_frame.reenter=[[REENTER]], new_task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]] + // explicit barrier after master + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[REENTER]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // implicit barrier parallel + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address({{.}})=[[EXIT:0x[0-f]+]] + // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]] + // CHECK: 
{{^}}[[THREAD_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[REENTER]] + // this is expected to come earlier and at MASTER: + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[IMPLICIT_TASK_ID]], second_task_id=[[TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(1)=[[TASK_EXIT:0x[0-f]+]] + // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], exit_frame=[[TASK_EXIT]], reenter_frame=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[REENTER]] + // CHECK: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[TASK_ID]], second_task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_end: task_id=[[TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + + + return 0; +} diff --git a/final/runtime/test/ompt/tasks/serialized.c b/final/runtime/test/ompt/tasks/serialized.c new file mode 100644 index 
0000000..12a0281 --- /dev/null +++ b/final/runtime/test/ompt/tasks/serialized.c @@ -0,0 +1,154 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7 +#define TEST_NEED_PRINT_FRAME_FROM_OUTLINED_FN +#include "callback.h" +#include <omp.h> +#include <math.h> + +int main() { + omp_set_nested(0); + print_frame(0); +#pragma omp parallel num_threads(2) + { + print_frame_from_outlined_fn(1); + print_ids(0); + print_ids(1); + print_frame(0); +#pragma omp master + { + print_ids(0); + void *creator_frame = get_frame_address(0); + int t = (int)sin(0.1); +#pragma omp task if (t) + { + void *task_frame = get_frame_address(0); + if (creator_frame == task_frame) { + // Assume this code was inlined which the compiler is allowed to do. + print_frame(0); + } else { + // The exit frame must be our parent! + print_frame_from_outlined_fn(1); + } + print_ids(0); + print_ids(1); + print_ids(2); + } + print_fuzzy_address(1); + print_ids(0); + } + print_ids(0); + } + + // Check if libomp supports the callbacks for this test. 
+ // CHECK-NOT: {{^}}0: Could not register callback + + // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] + + // make sure initial data pointers are null + // CHECK-NOT: 0: new_task_data initially not null + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_task_create + // CHECK-SAME: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[NULL]] + // CHECK-SAME: parent_task_frame.reenter=[[NULL]] + // CHECK-SAME: new_task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK-SAME: task_type=ompt_task_initial=1, has_dependences=no + // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address(0) + // CHECK-SAME: =[[MAIN_REENTER:0x[0-f]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin + // CHECK-SAME: parent_task_id=[[PARENT_TASK_ID:[0-9]+]] + // CHECK-SAME: parent_task_frame.exit=[[NULL]] + // CHECK-SAME: parent_task_frame.reenter=[[MAIN_REENTER]] + // CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=2 + // CHECK-SAME: codeptr_ra=0x{{[0-f]+}}, invoker={{[0-9]+}} + + // nested parallel masters + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin + // CHECK-SAME: parallel_id=[[PARALLEL_ID]] + // CHECK-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address + // CHECK-SAME: =[[EXIT:0x[0-f]+]] + + // CHECK: {{^}}[[MASTER_ID]]: task level 0 + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK-SAME: exit_frame=[[EXIT]], reenter_frame=[[NULL]] + + // CHECK: {{^}}[[MASTER_ID]]: task level 1 + // CHECK-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]] + // CHECK-SAME: task_id=[[PARENT_TASK_ID]], + // CHECK-SAME: exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]] + + // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create + // CHECK-SAME: parent_task_id=[[IMPLICIT_TASK_ID]] + // CHECK-SAME: parent_task_frame.exit=[[EXIT]] + // CHECK-SAME: parent_task_frame.reenter=[[REENTER]] + // CHECK-SAME: 
new_task_id=[[TASK_ID:[0-9]+]] + // CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: + // CHECK-SAME: first_task_id=[[IMPLICIT_TASK_ID]], second_task_id=[[TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address + // CHECK-SAME: =[[TASK_EXIT:0x[0-f]+]] + // CHECK: {{^}}[[MASTER_ID]]: task level 0 + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]] + // CHECK-SAME: exit_frame=[[TASK_EXIT]], reenter_frame=[[NULL]] + + // CHECK: {{^}}[[MASTER_ID]]: task level 1 + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK-SAME: exit_frame=[[EXIT]], reenter_frame=[[REENTER]] + + // CHECK: {{^}}[[MASTER_ID]]: task level 2 + // CHECK-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]] + // CHECK-SAME: task_id=[[PARENT_TASK_ID]] + // CHECK-SAME: exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]] + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule + // CHECK-SAME: first_task_id=[[TASK_ID]], second_task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_end: task_id=[[TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + + // CHECK: {{^}}[[MASTER_ID]]: task level 0 + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK-SAME: exit_frame=[[EXIT]], reenter_frame=[[NULL]] + + // implicit barrier parallel + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: task level 0 + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK-SAME: exit_frame=[[NULL]], reenter_frame=[[NULL]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end + // parallel_id is 0 because the region ended in the barrier! 
+ // CHECK-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end + // CHECK-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] + + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin + // CHECK-SAME: parallel_id=[[PARALLEL_ID]] + // CHECK-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address + // CHECK-SAME: =[[EXIT:0x[0-f]+]] + // CHECK: {{^}}[[THREAD_ID]]: task level 0 + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK-SAME: exit_frame=[[EXIT]], reenter_frame=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: task level 1 + // CHECK-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]] + // CHECK-SAME: task_id=[[PARENT_TASK_ID]] + // CHECK-SAME: exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]] + + // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(0)={{0x[0-f]+}} + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: task level 0 + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK-SAME: exit_frame=[[NULL]], reenter_frame=[[NULL]] + // parallel_id is 0 because the region ended in the barrier! 
+ // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end + // CHECK-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] + + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end + // CHECK-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] + + return 0; +} diff --git a/final/runtime/test/ompt/tasks/task_in_joinbarrier.c b/final/runtime/test/ompt/tasks/task_in_joinbarrier.c new file mode 100644 index 0000000..25b57a9 --- /dev/null +++ b/final/runtime/test/ompt/tasks/task_in_joinbarrier.c @@ -0,0 +1,91 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +#define TEST_NEED_PRINT_FRAME_FROM_OUTLINED_FN +#include "callback.h" +#include <omp.h> + +int main() +{ + int condition=0; + omp_set_nested(0); + print_frame(0); + #pragma omp parallel num_threads(2) + { + print_frame_from_outlined_fn(1); + print_ids(0); + print_ids(1); + print_frame(0); + #pragma omp master + { + print_ids(0); + #pragma omp task shared(condition) + { + OMPT_SIGNAL(condition); + print_frame(1); + print_ids(0); + print_ids(1); + print_ids(2); + } + OMPT_WAIT(condition,1); + print_ids(0); + } + print_ids(0); + } + + + // Check if libomp supports the callbacks for this test. 
+ // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released' + + + // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] + + // make sure initial data pointers are null + // CHECK-NOT: 0: new_task_data initially not null + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: __builtin_frame_address(0)=[[MAIN_REENTER:0x[0-f]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=[[MAIN_REENTER]], parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=2, codeptr_ra=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:[0-9]+]] + // nested parallel masters + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address({{.}})=[[EXIT:0x[0-f]+]] + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]] + // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]] + // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]] + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]] + // <- 
ompt_event_task_create would be expected here + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[EXIT]], parent_task_frame.reenter=[[REENTER]], new_task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[TASK_FUNCTION:0x[0-f]+]] + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]] + // implicit barrier parallel + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address({{.}})=[[EXIT:0x[0-f]+]] + // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]] + // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]] + // implicit barrier parallel + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[IMPLICIT_TASK_ID]], second_task_id=[[TASK_ID]] + // CHECK: 
{{^}}[[THREAD_ID]]: __builtin_frame_address(1)=[[TASK_EXIT:0x[0-f]+]] + // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], exit_frame=[[TASK_EXIT]], reenter_frame=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[TASK_ID]], second_task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_end: task_id=[[TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + + + return 0; +} diff --git a/final/runtime/test/ompt/tasks/task_types.c b/final/runtime/test/ompt/tasks/task_types.c new file mode 100644 index 0000000..40ceb2d --- /dev/null +++ b/final/runtime/test/ompt/tasks/task_types.c @@ -0,0 +1,222 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// REQUIRES: ompt +#include "callback.h" +#include <omp.h> +#include <math.h> + +int main() { + //initialize the OpenMP runtime + omp_get_num_threads(); + + // initial task + print_ids(0); + + int x; +// implicit task +#pragma omp parallel num_threads(1) + { + print_ids(0); + x++; + } + +#pragma omp parallel num_threads(2) + { +// explicit task +#pragma omp single +#pragma omp task + { + print_ids(0); + x++; + } +// explicit task with undeferred +#pragma omp single +#pragma omp task if (0) + { + print_ids(0); + x++; + } + +// explicit task with untied +#pragma omp single +#pragma omp task untied + { + // Output of thread_id is needed to know on which thread task is executed + printf("%" PRIu64 ": explicit_untied\n", ompt_get_thread_data()->value); 
+ print_ids(0); + print_frame(1); + x++; +#pragma omp taskyield + printf("%" PRIu64 ": explicit_untied(2)\n", + ompt_get_thread_data()->value); + print_ids(0); + print_frame(1); + x++; +#pragma omp taskwait + printf("%" PRIu64 ": explicit_untied(3)\n", + ompt_get_thread_data()->value); + print_ids(0); + print_frame(1); + x++; + } +// explicit task with final +#pragma omp single +#pragma omp task final(1) + { + print_ids(0); + x++; +// nested explicit task with final and undeferred +#pragma omp task + { + print_ids(0); + x++; + } + } + + // Mergeable task test deactivated for now + // explicit task with mergeable + /* + #pragma omp task mergeable if((int)sin(0)) + { + print_ids(0); + x++; + } + */ + + // TODO: merged task + } + + // Check if libomp supports the callbacks for this test. + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create' + + // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_task_create: parent_task_id=0 + // CHECK-SAME: parent_task_frame.exit=[[NULL]] + // CHECK-SAME: parent_task_frame.reenter=[[NULL]] + // CHECK-SAME: new_task_id=[[INITIAL_TASK_ID:[0-9]+]], codeptr_ra=[[NULL]] + // CHECK-SAME: task_type=ompt_task_initial=1, has_dependences=no + + // CHECK-NOT: 0: parallel_data initially not null + + // initial task + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id={{[0-9]+}} + // CHECK-SAME: task_id=[[INITIAL_TASK_ID]], exit_frame=[[NULL]] + // CHECK-SAME: reenter_frame=[[NULL]] + // CHECK-SAME: task_type=ompt_task_initial=1, thread_num=0 + + // implicit task + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id={{[0-9]+}} + // CHECK-SAME: task_id={{[0-9]+}}, exit_frame={{0x[0-f]+}} + // CHECK-SAME: reenter_frame=[[NULL]] + // CHECK-SAME: task_type=ompt_task_implicit|ompt_task_undeferred=134217730 + // CHECK-SAME: thread_num=0 + + // explicit task + // CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}} + // CHECK-SAME: 
parent_task_frame.exit={{0x[0-f]+}} + // CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}} + // CHECK-SAME: new_task_id=[[EXPLICIT_TASK_ID:[0-9]+]] + // CHECK-SAME: codeptr_ra={{0x[0-f]+}} + // CHECK-SAME: task_type=ompt_task_explicit=4 + // CHECK-SAME: has_dependences=no + + // CHECK: [[THREAD_ID_1:[0-9]+]]: ompt_event_task_schedule: + // CHECK-SAME: second_task_id=[[EXPLICIT_TASK_ID]] + + // CHECK: [[THREAD_ID_1]]: task level 0: parallel_id=[[PARALLEL_ID:[0-9]+]] + // CHECK-SAME: task_id=[[EXPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}} + // CHECK-SAME: reenter_frame=[[NULL]], task_type=ompt_task_explicit=4 + // CHECK-SAME: thread_num={{[01]}} + + // explicit task with undeferred + // CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}} + // CHECK-SAME: parent_task_frame.exit={{0x[0-f]+}} + // CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}} + // CHECK-SAME: new_task_id=[[EXPLICIT_UNDEFERRED_TASK_ID:[0-9]+]] + // CHECK-SAME: codeptr_ra={{0x[0-f]+}} + // CHECK-SAME: task_type=ompt_task_explicit|ompt_task_undeferred=134217732 + // CHECK-SAME: has_dependences=no + + // CHECK: [[THREAD_ID_2:[0-9]+]]: ompt_event_task_schedule: + // CHECK-SAME: second_task_id=[[EXPLICIT_UNDEFERRED_TASK_ID]] + + // CHECK: [[THREAD_ID_2]]: task level 0: parallel_id=[[PARALLEL_ID]] + // CHECK-SAME: task_id=[[EXPLICIT_UNDEFERRED_TASK_ID]] + // CHECK-SAME: exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]] + // CHECK-SAME: task_type=ompt_task_explicit|ompt_task_undeferred=134217732 + // CHECK-SAME: thread_num={{[01]}} + + // explicit task with untied + // CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}} + // CHECK-SAME: parent_task_frame.exit={{0x[0-f]+}} + // CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}} + // CHECK-SAME: new_task_id=[[EXPLICIT_UNTIED_TASK_ID:[0-9]+]] + // CHECK-SAME: codeptr_ra={{0x[0-f]+}} + // CHECK-SAME: task_type=ompt_task_explicit|ompt_task_untied=268435460 + // CHECK-SAME: has_dependences=no + + // Here the thread_id cannot be 
taken from a schedule event as there + // may be multiple of those + // CHECK: [[THREAD_ID_3:[0-9]+]]: explicit_untied + // CHECK: [[THREAD_ID_3]]: task level 0: parallel_id=[[PARALLEL_ID]] + // CHECK-SAME: task_id=[[EXPLICIT_UNTIED_TASK_ID]] + // CHECK-SAME: exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]] + // CHECK-SAME: task_type=ompt_task_explicit|ompt_task_untied=268435460 + // CHECK-SAME: thread_num={{[01]}} + + // after taskyield + // CHECK: [[THREAD_ID_3_2:[0-9]+]]: explicit_untied(2) + // CHECK: [[THREAD_ID_3_2]]: task level 0: parallel_id=[[PARALLEL_ID]] + // CHECK-SAME: task_id=[[EXPLICIT_UNTIED_TASK_ID]] + // CHECK-SAME: exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]] + // CHECK-SAME: task_type=ompt_task_explicit|ompt_task_untied=268435460 + // CHECK-SAME: thread_num={{[01]}} + + // after taskwait + // CHECK: [[THREAD_ID_3_3:[0-9]+]]: explicit_untied(3) + // CHECK: [[THREAD_ID_3_3]]: task level 0: parallel_id=[[PARALLEL_ID]] + // CHECK-SAME: task_id=[[EXPLICIT_UNTIED_TASK_ID]] + // CHECK-SAME: exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]] + // CHECK-SAME: task_type=ompt_task_explicit|ompt_task_untied=268435460 + // CHECK-SAME: thread_num={{[01]}} + + // explicit task with final + // CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}} + // CHECK-SAME: parent_task_frame.exit={{0x[0-f]+}} + // CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}} + // CHECK-SAME: new_task_id=[[EXPLICIT_FINAL_TASK_ID:[0-9]+]] + // CHECK-SAME: codeptr_ra={{0x[0-f]+}} + // CHECK-SAME: task_type=ompt_task_explicit|ompt_task_final=536870916 + // CHECK-SAME: has_dependences=no + + // CHECK: [[THREAD_ID_4:[0-9]+]]: ompt_event_task_schedule: + // CHECK-SAME: second_task_id=[[EXPLICIT_FINAL_TASK_ID]] + + // CHECK: [[THREAD_ID_4]]: task level 0: parallel_id=[[PARALLEL_ID]] + // CHECK-SAME: task_id=[[EXPLICIT_FINAL_TASK_ID]] + // CHECK-SAME: exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]] + // CHECK-SAME: task_type=ompt_task_explicit|ompt_task_final=536870916 + // 
CHECK-SAME: thread_num={{[01]}} + + // nested explicit task with final and undeferred + // CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}} + // CHECK-SAME: parent_task_frame.exit={{0x[0-f]+}} + // CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}} + // CHECK-SAME: new_task_id=[[NESTED_FINAL_UNDEFERRED_TASK_ID:[0-9]+]] + // CHECK-SAME: codeptr_ra={{0x[0-f]+}} + // CHECK-SAME: task_type=ompt_task_explicit|ompt_task_undeferred + // CHECK-SAME:|ompt_task_final=671088644 + // CHECK-SAME: has_dependences=no + + // CHECK: [[THREAD_ID_5:[0-9]+]]: ompt_event_task_schedule: + // CHECK-SAME: second_task_id=[[NESTED_FINAL_UNDEFERRED_TASK_ID]] + + // CHECK: [[THREAD_ID_5]]: task level 0: parallel_id=[[PARALLEL_ID]] + // CHECK-SAME: task_id=[[NESTED_FINAL_UNDEFERRED_TASK_ID]] + // CHECK-SAME: exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]] + // CHECK-SAME: task_type=ompt_task_explicit|ompt_task_undeferred + // CHECK-SAME:|ompt_task_final=671088644 + // CHECK-SAME: thread_num={{[01]}} + + return 0; +} diff --git a/final/runtime/test/ompt/tasks/task_types_serialized.c b/final/runtime/test/ompt/tasks/task_types_serialized.c new file mode 100644 index 0000000..7726f5b --- /dev/null +++ b/final/runtime/test/ompt/tasks/task_types_serialized.c @@ -0,0 +1,113 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// REQUIRES: ompt + +#include "callback.h" +#include <omp.h> + +__attribute__ ((noinline)) // workaround for bug in icc +void print_task_type(int id) +{ + #pragma omp critical + { + int task_type; + char buffer[2048]; + ompt_get_task_info(0, &task_type, NULL, NULL, NULL, NULL); + format_task_type(task_type, buffer); + printf("%" PRIu64 ": id=%d task_type=%s=%d\n", ompt_get_thread_data()->value, id, buffer, task_type); + } +}; + +int main() +{ + //initial task + print_task_type(0); + + int x; + //implicit task + #pragma omp parallel num_threads(1) + { + print_task_type(1); + x++; + } + + #pragma omp parallel num_threads(1) + #pragma omp master + { + //explicit 
task + #pragma omp task + { + print_task_type(2); + x++; + } + + //explicit task with undeferred + #pragma omp task if(0) + { + print_task_type(3); + x++; + } + + //explicit task with untied + #pragma omp task untied + { + print_task_type(4); + x++; + } + + //explicit task with final + #pragma omp task final(1) + { + print_task_type(5); + x++; + //nested explicit task with final and undeferred + #pragma omp task + { + print_task_type(6); + x++; + } + } + +/* + //TODO:not working + //explicit task with mergeable + #pragma omp task mergeable + { + print_task_type(7); + x++; + } +*/ + + //TODO: merged task + } + + + // Check if libomp supports the callbacks for this test. + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create' + + + // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_task_create: parent_task_id=0, parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=[[NULL]], new_task_id={{[0-9]+}}, codeptr_ra=[[NULL]], task_type=ompt_task_initial=1, has_dependences=no + // CHECK: {{^}}[[MASTER_ID]]: id=0 task_type=ompt_task_initial=1 + // CHECK: {{^}}[[MASTER_ID]]: id=1 task_type=ompt_task_implicit|ompt_task_undeferred=134217730 + + // CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit|ompt_task_undeferred=134217732, has_dependences=no + // CHECK: {{^[0-9]+}}: id=2 task_type=ompt_task_explicit|ompt_task_undeferred=134217732 + + // CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit|ompt_task_undeferred=134217732, has_dependences=no + // CHECK: {{^[0-9]+}}: id=3 task_type=ompt_task_explicit|ompt_task_undeferred=134217732 + + // CHECK: {{^[0-9]+}}: 
ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit|ompt_task_undeferred|ompt_task_untied=402653188, has_dependences=no + // CHECK: {{^[0-9]+}}: id=4 task_type=ompt_task_explicit|ompt_task_undeferred|ompt_task_untied=402653188 + + // CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit|ompt_task_undeferred|ompt_task_final=671088644, has_dependences=no + // CHECK: {{^[0-9]+}}: id=5 task_type=ompt_task_explicit|ompt_task_undeferred|ompt_task_final=671088644 + + // CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit|ompt_task_undeferred|ompt_task_final=671088644, has_dependences=no + // CHECK: {{^[0-9]+}}: id=6 task_type=ompt_task_explicit|ompt_task_undeferred|ompt_task_final=671088644 + + // ___CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit|ompt_task_undeferred=134217732, has_dependences=no + // ___CHECK: {{^[0-9]+}}: id=7 task_type=ompt_task_explicit|ompt_task_undeferred=134217732 + + return 0; +} diff --git a/final/runtime/test/ompt/tasks/taskloop.c b/final/runtime/test/ompt/tasks/taskloop.c new file mode 100644 index 0000000..59a47bf --- /dev/null +++ b/final/runtime/test/ompt/tasks/taskloop.c @@ -0,0 +1,81 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// RUN: %libomp-compile-and-run | FileCheck --check-prefix=TASKS %s +// REQUIRES: ompt + +// These compilers don't support the taskloop 
construct +// UNSUPPORTED: gcc-4, gcc-5, icc-16 +// GCC 6 has support for taskloops, but at least 6.3.0 is crashing on this test +// UNSUPPORTED: gcc-6 + +#include "callback.h" +#include <omp.h> + +int main() { + unsigned int i, x; + +#pragma omp parallel num_threads(2) + { +#pragma omp barrier + +#pragma omp master +#pragma omp taskloop + for (i = 0; i < 5; i += 3) { + x++; + } + } + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: + // CHECK-SAME: parent_task_id={{[0-9]+}} + // CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]] + // CHECK-SAME: requested_team_size=2 + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]] + // CHECK-SAME: task_id=[[IMPLICIT_TASK_ID1:[0-9]+]] + // CHECK-SAME: team_size=2, thread_num=0 + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_taskgroup_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID1]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_taskloop_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]] + // CHECK-SAME: parent_task_id=[[IMPLICIT_TASK_ID1]] + // CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]], count=2 + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: + // CHECK-SAME: parent_task_id=[[IMPLICIT_TASK_ID1]] + // CHECK-SAME: new_task_id=[[TASK_ID1:[0-9]+]] + // CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS]] + // CHECK-SAME: task_type=ompt_task_explicit=4 + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: + // CHECK-SAME: parent_task_id=[[IMPLICIT_TASK_ID1]] + // CHECK-SAME: new_task_id=[[TASK_ID2:[0-9]+]] + // CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS]] + // CHECK-SAME: task_type=ompt_task_explicit=4 + // CHECK-NOT: {{^}}[[MASTER_ID]]: ompt_event_task_create: + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_taskloop_end: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]] + // CHECK-SAME: parent_task_id=[[IMPLICIT_TASK_ID1]] + // CHECK-SAME: count=2 + // CHECK-DAG: {{^}}[[MASTER_ID]]: 
ompt_event_wait_taskgroup_begin: + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_taskgroup_end: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID1]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_taskgroup_end: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID1]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=0 + // CHECK-SAME: task_id=[[IMPLICIT_TASK_ID1]], team_size=2, thread_num=0 + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]] + + // TASKS: ompt_event_task_create:{{.*}} new_task_id={{[0-9]+}} + // TASKS-SAME: task_type=ompt_task_initial + // TASKS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_taskloop_begin: + // TASKS: ompt_event_task_create:{{.*}} new_task_id=[[TASK_ID1:[0-9]+]] + // TASKS-SAME: task_type=ompt_task_explicit + // TASKS-DAG: ompt_event_task_create:{{.*}} new_task_id=[[TASK_ID2:[0-9]+]] + // Schedule events: + // TASKS-DAG: {{^.*}}first_task_id={{[0-9]+}}, second_task_id=[[TASK_ID1]] + // TASKS-DAG: {{^.*}}first_task_id=[[TASK_ID1]], second_task_id={{[0-9]+}} + // TASKS-DAG: {{^.*}}first_task_id={{[0-9]+}}, second_task_id=[[TASK_ID2]] + // TASKS-DAG: {{^.*}}first_task_id=[[TASK_ID2]], second_task_id={{[0-9]+}} + // TASKS-NOT: ompt_event_task_schedule + + return 0; +} diff --git a/final/runtime/test/ompt/tasks/taskyield.c b/final/runtime/test/ompt/tasks/taskyield.c new file mode 100644 index 0000000..56a4697 --- /dev/null +++ b/final/runtime/test/ompt/tasks/taskyield.c @@ -0,0 +1,62 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +// Current GOMP interface implements taskyield as stub +// XFAIL: gcc + +#include "callback.h" +#include <omp.h> +#include <unistd.h> + +int main() +{ + int condition=0, x=0; + #pragma omp parallel num_threads(2) + { + #pragma omp master + { + #pragma omp task shared(condition) + { + OMPT_SIGNAL(condition); + OMPT_WAIT(condition,2); + } + OMPT_WAIT(condition,1); + 
#pragma omp task shared(x) + { + x++; + } + printf("%" PRIu64 ": before yield\n", ompt_get_thread_data()->value); + #pragma omp taskyield + printf("%" PRIu64 ": after yield\n", ompt_get_thread_data()->value); + OMPT_SIGNAL(condition); + } + } + + + // Check if libomp supports the callbacks for this test. + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task' + + + // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] + + // make sure initial data pointers are null + // CHECK-NOT: 0: new_task_data initially not null + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID:[0-9]+]], team_size={{[0-9]+}}, thread_num={{[0-9]+}} + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id=[[WORKER_TASK:[0-9]+]], codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit=4, has_dependences=no + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id=[[MAIN_TASK:[0-9]+]], codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit=4, has_dependences=no + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[IMPLICIT_TASK_ID]], second_task_id=[[MAIN_TASK]], prior_task_status=ompt_task_yield=2 + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[MAIN_TASK]], second_task_id=[[IMPLICIT_TASK_ID]], prior_task_status=ompt_task_complete=1 + + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_task_schedule: first_task_id={{[0-9]+}}, second_task_id=[[WORKER_TASK]], prior_task_status=ompt_task_others=4 + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[WORKER_TASK]], 
second_task_id={{[0-9]+}}, prior_task_status=ompt_task_complete=1 + + + + + + return 0; +} diff --git a/final/runtime/test/ompt/tasks/untied_task.c b/final/runtime/test/ompt/tasks/untied_task.c new file mode 100644 index 0000000..e68fa26 --- /dev/null +++ b/final/runtime/test/ompt/tasks/untied_task.c @@ -0,0 +1,108 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +#define TEST_NEED_PRINT_FRAME_FROM_OUTLINED_FN +#include "callback.h" +#include <omp.h> + +int main() +{ + int condition=0; + omp_set_nested(0); + print_frame(0); + #pragma omp parallel num_threads(2) + { + print_frame_from_outlined_fn(1); + print_ids(0); + print_ids(1); + print_frame(0); + #pragma omp master + { + print_ids(0); + #pragma omp task untied shared(condition) + { + OMPT_SIGNAL(condition); + print_frame(1); + print_ids(0); + print_ids(1); + print_ids(2); + #pragma omp task if(0) + { + print_ids(0); + print_ids(1); + print_ids(2); + } + print_ids(0); + print_ids(1); + print_ids(2); + } + OMPT_WAIT(condition,1); + print_ids(0); + } + #pragma omp barrier + print_ids(0); + } + + + // Check if libomp supports the callbacks for this test. 
+ // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released' + + + // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] + + // make sure initial data pointers are null + // CHECK-NOT: 0: new_task_data initially not null + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: __builtin_frame_address(0)=[[MAIN_REENTER:0x[0-f]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=[[MAIN_REENTER]], parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=2, codeptr_ra=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:[0-9]+]] + // nested parallel masters + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address({{.}})=[[EXIT:0x[0-f]+]] + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]] + // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]] + // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]] + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]] + // <- 
ompt_event_task_create would be expected here + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[EXIT]], parent_task_frame.reenter=[[REENTER]], new_task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[TASK_FUNCTION:0x[0-f]+]] + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]] + // explicit barrier after master + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[REENTER]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // implicit barrier parallel + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address({{.}})=[[EXIT:0x[0-f]+]] + // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]] + // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]] + // CHECK: 
{{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[REENTER]] + // this is expected to come earlier and at MASTER: + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[IMPLICIT_TASK_ID]], second_task_id=[[TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(1)=[[TASK_EXIT:0x[0-f]+]] + // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], exit_frame=[[TASK_EXIT]], reenter_frame=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[REENTER]] + // CHECK: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[TASK_ID]], second_task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_end: task_id=[[TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + + + return 0; +} diff --git a/final/runtime/test/ompt/worksharing/for/auto.c b/final/runtime/test/ompt/worksharing/for/auto.c new file mode 100644 index 0000000..17d26f5 --- /dev/null +++ 
b/final/runtime/test/ompt/worksharing/for/auto.c @@ -0,0 +1,7 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base.h +// REQUIRES: ompt +// GCC doesn't call runtime for auto = static schedule +// XFAIL: gcc + +#define SCHEDULE auto +#include "base.h" diff --git a/final/runtime/test/ompt/worksharing/for/auto_serialized.c b/final/runtime/test/ompt/worksharing/for/auto_serialized.c new file mode 100644 index 0000000..f756166 --- /dev/null +++ b/final/runtime/test/ompt/worksharing/for/auto_serialized.c @@ -0,0 +1,7 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base_serialized.h +// REQUIRES: ompt +// GCC doesn't call runtime for auto = static schedule +// XFAIL: gcc + +#define SCHEDULE auto +#include "base_serialized.h" diff --git a/final/runtime/test/ompt/worksharing/for/auto_split.c b/final/runtime/test/ompt/worksharing/for/auto_split.c new file mode 100644 index 0000000..d82e3fd --- /dev/null +++ b/final/runtime/test/ompt/worksharing/for/auto_split.c @@ -0,0 +1,8 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base_split.h +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=CHECK-LOOP %S/base_split.h +// REQUIRES: ompt +// GCC doesn't call runtime for auto = static schedule +// XFAIL: gcc + +#define SCHEDULE auto +#include "base_split.h" diff --git a/final/runtime/test/ompt/worksharing/for/base.h b/final/runtime/test/ompt/worksharing/for/base.h new file mode 100644 index 0000000..8a496d9 --- /dev/null +++ b/final/runtime/test/ompt/worksharing/for/base.h @@ -0,0 +1,43 @@ +#include "callback.h" +#include <omp.h> + +int main() +{ + unsigned int i; + + #pragma omp parallel for num_threads(4) schedule(SCHEDULE) + for (i = 0; i < 4; i++) { + } + + // Check if libomp supports the callbacks for this test. 
+ // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_work' + + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=0x{{[0-f]+}}, invoker={{[0-9]+}} + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id=[[IMPLICIT_TASK_ID]], codeptr_ra= + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id=[[IMPLICIT_TASK_ID]], codeptr_ra= + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id=[[IMPLICIT_TASK_ID]], codeptr_ra= + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], 
task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id=[[IMPLICIT_TASK_ID]], codeptr_ra= + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + + return 0; +} diff --git a/final/runtime/test/ompt/worksharing/for/base_serialized.h b/final/runtime/test/ompt/worksharing/for/base_serialized.h new file mode 100644 index 0000000..3376b37 --- /dev/null +++ b/final/runtime/test/ompt/worksharing/for/base_serialized.h @@ -0,0 +1,28 @@ +#include "callback.h" +#include <omp.h> + +int main() +{ + unsigned int i; + + #pragma omp parallel for num_threads(1) schedule(SCHEDULE) + for (i = 0; i < 1; i++) { + } + + // Check if libomp supports the callbacks for this test. 
+ // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_work' + + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=0x{{[0-f]+}}, invoker={{[0-9]+}} + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id=[[IMPLICIT_TASK_ID]], codeptr_ra=0x{{[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[PARALLEL_ID,0]}}, task_id=[[IMPLICIT_TASK_ID]] + + return 0; +} diff --git a/final/runtime/test/ompt/worksharing/for/base_split.h b/final/runtime/test/ompt/worksharing/for/base_split.h new file mode 100644 index 0000000..0f1fed3 --- /dev/null +++ b/final/runtime/test/ompt/worksharing/for/base_split.h @@ -0,0 +1,66 @@ +#include "callback.h" +#include <omp.h> + +/* With the combined parallel-for construct (base.h), the return-addresses are hard to compare. + With the separate parallel and for-nowait construct, the addresses become more predictable, + but the begin of the for-loop still generates additional code, so the offset of loop-begin + to the label is >4 Byte. 
+*/ + +int main() +{ + unsigned int i; + + #pragma omp parallel num_threads(4) + { + print_current_address(0); + #pragma omp for schedule(SCHEDULE) nowait + for (i = 0; i < 4; i++) { + print_fuzzy_address(1); + } + print_fuzzy_address(2); + } + print_fuzzy_address(3); + + // Check if libomp supports the callbacks for this test. + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_work' + + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[PARALLEL_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker={{[0-9]+}} + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id={{[0-9]+}}, codeptr_ra=[[LOOP_BEGIN_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}, codeptr_ra=[[LOOP_END_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[LOOP_END_RETURN_ADDRESS]] + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}, invoker={{[0-9]+}}, codeptr_ra=[[PARALLEL_RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[PARALLEL_RETURN_ADDRESS]] + + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}} + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}, codeptr_ra=[[LOOP_END_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK: 
{{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[LOOP_END_RETURN_ADDRESS]] + + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}} + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}, codeptr_ra=[[LOOP_END_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[LOOP_END_RETURN_ADDRESS]] + + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}} + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}, codeptr_ra=[[LOOP_END_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[LOOP_END_RETURN_ADDRESS]] + + + // CHECK-LOOP: 0: NULL_POINTER=[[NULL:.*$]] + // CHECK-LOOP: 0: ompt_event_runtime_shutdown + // CHECK-LOOP: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra={{0x[0-f]+}}, invoker={{[0-9]+}} + // CHECK-LOOP: {{^}}[[MASTER_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id={{[0-9]+}}, codeptr_ra=[[LOOP_BEGIN_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK-LOOP: {{^}}{{[0-9]+}}: fuzzy_address={{.*}}[[LOOP_BEGIN_RETURN_ADDRESS]] + // CHECK-LOOP: {{^}}{{[0-9]+}}: fuzzy_address={{.*}}[[LOOP_BEGIN_RETURN_ADDRESS]] + // CHECK-LOOP: {{^}}{{[0-9]+}}: fuzzy_address={{.*}}[[LOOP_BEGIN_RETURN_ADDRESS]] + // CHECK-LOOP: {{^}}{{[0-9]+}}: fuzzy_address={{.*}}[[LOOP_BEGIN_RETURN_ADDRESS]] + + + return 0; +} diff --git a/final/runtime/test/ompt/worksharing/for/dynamic.c b/final/runtime/test/ompt/worksharing/for/dynamic.c new file mode 100644 index 0000000..ca5ae10 --- /dev/null +++ b/final/runtime/test/ompt/worksharing/for/dynamic.c @@ -0,0 +1,5 @@ +// 
RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base.h +// REQUIRES: ompt + +#define SCHEDULE dynamic +#include "base.h" diff --git a/final/runtime/test/ompt/worksharing/for/dynamic_serialized.c b/final/runtime/test/ompt/worksharing/for/dynamic_serialized.c new file mode 100644 index 0000000..0f80929 --- /dev/null +++ b/final/runtime/test/ompt/worksharing/for/dynamic_serialized.c @@ -0,0 +1,5 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base_serialized.h +// REQUIRES: ompt + +#define SCHEDULE dynamic +#include "base_serialized.h" diff --git a/final/runtime/test/ompt/worksharing/for/dynamic_split.c b/final/runtime/test/ompt/worksharing/for/dynamic_split.c new file mode 100644 index 0000000..cf14971 --- /dev/null +++ b/final/runtime/test/ompt/worksharing/for/dynamic_split.c @@ -0,0 +1,7 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base_split.h +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=CHECK-LOOP %S/base_split.h +// REQUIRES: ompt +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7 + +#define SCHEDULE dynamic +#include "base_split.h" diff --git a/final/runtime/test/ompt/worksharing/for/guided.c b/final/runtime/test/ompt/worksharing/for/guided.c new file mode 100644 index 0000000..01bff4e --- /dev/null +++ b/final/runtime/test/ompt/worksharing/for/guided.c @@ -0,0 +1,5 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base.h +// REQUIRES: ompt + +#define SCHEDULE guided +#include "base.h" diff --git a/final/runtime/test/ompt/worksharing/for/guided_serialized.c b/final/runtime/test/ompt/worksharing/for/guided_serialized.c new file mode 100644 index 0000000..4b5096d --- /dev/null +++ b/final/runtime/test/ompt/worksharing/for/guided_serialized.c @@ -0,0 +1,5 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base_serialized.h +// REQUIRES: ompt + +#define SCHEDULE guided +#include "base_serialized.h" diff --git 
a/final/runtime/test/ompt/worksharing/for/guided_split.c b/final/runtime/test/ompt/worksharing/for/guided_split.c new file mode 100644 index 0000000..7d560c2 --- /dev/null +++ b/final/runtime/test/ompt/worksharing/for/guided_split.c @@ -0,0 +1,7 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base_split.h +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=CHECK-LOOP %S/base_split.h +// REQUIRES: ompt +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7 + +#define SCHEDULE guided +#include "base_split.h" diff --git a/final/runtime/test/ompt/worksharing/for/runtime.c b/final/runtime/test/ompt/worksharing/for/runtime.c new file mode 100644 index 0000000..bcf160f --- /dev/null +++ b/final/runtime/test/ompt/worksharing/for/runtime.c @@ -0,0 +1,5 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base.h +// REQUIRES: ompt + +#define SCHEDULE runtime +#include "base.h" diff --git a/final/runtime/test/ompt/worksharing/for/runtime_serialized.c b/final/runtime/test/ompt/worksharing/for/runtime_serialized.c new file mode 100644 index 0000000..231d67d --- /dev/null +++ b/final/runtime/test/ompt/worksharing/for/runtime_serialized.c @@ -0,0 +1,5 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base_serialized.h +// REQUIRES: ompt + +#define SCHEDULE runtime +#include "base_serialized.h" diff --git a/final/runtime/test/ompt/worksharing/for/runtime_split.c b/final/runtime/test/ompt/worksharing/for/runtime_split.c new file mode 100644 index 0000000..7a677ed --- /dev/null +++ b/final/runtime/test/ompt/worksharing/for/runtime_split.c @@ -0,0 +1,7 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base_split.h +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=CHECK-LOOP %S/base_split.h +// REQUIRES: ompt +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7 + +#define SCHEDULE runtime +#include "base_split.h" diff --git a/final/runtime/test/ompt/worksharing/for/static.c 
b/final/runtime/test/ompt/worksharing/for/static.c new file mode 100644 index 0000000..4d99059 --- /dev/null +++ b/final/runtime/test/ompt/worksharing/for/static.c @@ -0,0 +1,7 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base.h +// REQUIRES: ompt +// GCC doesn't call runtime for static schedule +// XFAIL: gcc + +#define SCHEDULE static +#include "base.h" diff --git a/final/runtime/test/ompt/worksharing/for/static_serialized.c b/final/runtime/test/ompt/worksharing/for/static_serialized.c new file mode 100644 index 0000000..4860d49 --- /dev/null +++ b/final/runtime/test/ompt/worksharing/for/static_serialized.c @@ -0,0 +1,7 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base_serialized.h +// REQUIRES: ompt +// GCC doesn't call runtime for static schedule +// XFAIL: gcc + +#define SCHEDULE static +#include "base_serialized.h" diff --git a/final/runtime/test/ompt/worksharing/for/static_split.c b/final/runtime/test/ompt/worksharing/for/static_split.c new file mode 100644 index 0000000..d8c88dd --- /dev/null +++ b/final/runtime/test/ompt/worksharing/for/static_split.c @@ -0,0 +1,8 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base_split.h +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=CHECK-LOOP %S/base_split.h +// REQUIRES: ompt +// GCC doesn't call runtime for static schedule +// XFAIL: gcc + +#define SCHEDULE static +#include "base_split.h" diff --git a/final/runtime/test/ompt/worksharing/sections.c b/final/runtime/test/ompt/worksharing/sections.c new file mode 100644 index 0000000..bafb743 --- /dev/null +++ b/final/runtime/test/ompt/worksharing/sections.c @@ -0,0 +1,36 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +// Some compilers generate code that does not distinguish between sections and loops +// XFAIL: gcc, clang-3, clang-4, clang-5, icc-16, icc-17 +// UNSUPPORTED: icc-18 + +#include "callback.h" +#include <omp.h> + +int main() +{ 
+ #pragma omp parallel sections num_threads(2) + { + #pragma omp section + { + printf("%lu: section 1\n", ompt_get_thread_data()->value); + } + #pragma omp section + { + printf("%lu: section 2\n", ompt_get_thread_data()->value); + } + } + + // Check if libomp supports the callbacks for this test. + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_work' + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_sections_begin: parallel_id=[[PARALLEL_ID:[0-9]+]], parent_task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[SECT_BEGIN:0x[0-f]+]], count=2 + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_sections_end: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}, codeptr_ra=[[SECT_END:0x[0-f]+]] + + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_sections_begin: parallel_id=[[PARALLEL_ID]], parent_task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[SECT_BEGIN]], count=2 + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_sections_end: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}, codeptr_ra=[[SECT_END]] + + return 0; +} diff --git a/final/runtime/test/ompt/worksharing/single.c b/final/runtime/test/ompt/worksharing/single.c new file mode 100644 index 0000000..6b24f2d --- /dev/null +++ b/final/runtime/test/ompt/worksharing/single.c @@ -0,0 +1,36 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +// GCC generates code that does not call the runtime for the single construct +// XFAIL: gcc + +#include "callback.h" +#include <omp.h> + +int main() +{ + int x = 0; + #pragma omp parallel num_threads(2) + { + #pragma omp single + { + printf("%" PRIu64 ": in single\n", ompt_get_thread_data()->value); + x++; + } + } + + printf("x=%d\n", x); + + // Check if libomp supports the callbacks for this test. 
+ // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_work' + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // CHECK-DAG: {{^}}[[THREAD_ID_1:[0-9]+]]: ompt_event_single_in_block_begin: parallel_id=[[PARALLEL_ID:[0-9]+]], parent_task_id=[[TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]+}}, count=1 + // CHECK-DAG: {{^}}[[THREAD_ID_1]]: in single + // CHECK-DAG: {{^}}[[THREAD_ID_1]]: ompt_event_single_in_block_end: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra={{0x[0-f]+}}, count=1 + + // CHECK-DAG: {{^}}[[THREAD_ID_2:[0-9]+]]: ompt_event_single_others_begin: parallel_id=[[PARALLEL_ID:[0-9]+]], task_id=[[TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]+}}, count=1 + // CHECK-DAG: {{^}}[[THREAD_ID_2]]: ompt_event_single_others_end: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra={{0x[0-f]+}}, count=1 + + return 0; +} diff --git a/final/runtime/test/parallel/omp_nested.c b/final/runtime/test/parallel/omp_nested.c new file mode 100644 index 0000000..8b78088 --- /dev/null +++ b/final/runtime/test/parallel/omp_nested.c @@ -0,0 +1,46 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" + +/* + * Test if the compiler supports nested parallelism + * By Chunhua Liao, University of Houston + * Oct. 
2005 + */ +int test_omp_nested() +{ +#ifdef _OPENMP + if (omp_get_max_threads() > 4) + omp_set_num_threads(4); +#endif + + int counter = 0; +#ifdef _OPENMP + omp_set_nested(1); +#endif + + #pragma omp parallel shared(counter) + { + #pragma omp critical + counter++; + #pragma omp parallel + { + #pragma omp critical + counter--; + } + } + return (counter != 0); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_nested()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/parallel/omp_parallel_copyin.c b/final/runtime/test/parallel/omp_parallel_copyin.c new file mode 100644 index 0000000..600f9b7 --- /dev/null +++ b/final/runtime/test/parallel/omp_parallel_copyin.c @@ -0,0 +1,47 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <stdlib.h> +#include "omp_testsuite.h" + +static int sum1 = 789; +#pragma omp threadprivate(sum1) + +int test_omp_parallel_copyin() +{ + int sum, num_threads; + int known_sum; + + sum = 0; + sum1 = 7; + num_threads = 0; + + #pragma omp parallel copyin(sum1) + { + /*printf("sum1=%d\n",sum1);*/ + int i; + #pragma omp for + for (i = 1; i < 1000; i++) { + sum1 = sum1 + i; + } /*end of for*/ + #pragma omp critical + { + sum = sum + sum1; + num_threads++; + } /*end of critical*/ + } /* end of parallel*/ + known_sum = (999 * 1000) / 2 + 7 * num_threads; + return (known_sum == sum); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_parallel_copyin()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/parallel/omp_parallel_default.c b/final/runtime/test/parallel/omp_parallel_default.c new file mode 100644 index 0000000..0a8e09e --- /dev/null +++ b/final/runtime/test/parallel/omp_parallel_default.c @@ -0,0 +1,43 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" + +int test_omp_parallel_default() +{ + int i; + int sum; + int mysum; + int 
known_sum; + sum =0; + known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2 ; + + #pragma omp parallel default(shared) private(i) private(mysum) + { + mysum = 0; + #pragma omp for + for (i = 1; i <= LOOPCOUNT; i++) { + mysum = mysum + i; + } + #pragma omp critical + { + sum = sum + mysum; + } /* end of critical */ + } /* end of parallel */ + if (known_sum != sum) { + fprintf(stderr, "KNOWN_SUM = %d; SUM = %d\n", known_sum, sum); + } + return (known_sum == sum); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_parallel_default()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/parallel/omp_parallel_firstprivate.c b/final/runtime/test/parallel/omp_parallel_firstprivate.c new file mode 100644 index 0000000..dbee76c --- /dev/null +++ b/final/runtime/test/parallel/omp_parallel_firstprivate.c @@ -0,0 +1,46 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <stdlib.h> +#include "omp_testsuite.h" + +//static int sum1 = 789; + +int test_omp_parallel_firstprivate() +{ + int sum, num_threads,sum1; + int known_sum; + + sum = 0; + sum1=7; + num_threads = 0; + + #pragma omp parallel firstprivate(sum1) + { + /*printf("sum1=%d\n",sum1);*/ + int i; + #pragma omp for + for (i = 1; i < 1000; i++) { + sum1 = sum1 + i; + } /*end of for*/ + #pragma omp critical + { + sum = sum + sum1; + num_threads++; + } /*end of critical*/ + } /* end of parallel*/ + known_sum = (999 * 1000) / 2 + 7 * num_threads; + return (known_sum == sum); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_parallel_firstprivate()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/parallel/omp_parallel_if.c b/final/runtime/test/parallel/omp_parallel_if.c new file mode 100644 index 0000000..abbf3cd --- /dev/null +++ b/final/runtime/test/parallel/omp_parallel_if.c @@ -0,0 +1,40 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include 
"omp_testsuite.h" + +int test_omp_parallel_if() +{ + int i; + int sum; + int known_sum; + int mysum; + int control=1; + + sum =0; + known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2 ; + #pragma omp parallel private(i) if(control==0) + { + mysum = 0; + for (i = 1; i <= LOOPCOUNT; i++) { + mysum = mysum + i; + } + #pragma omp critical + { + sum = sum + mysum; + } + } + return (known_sum == sum); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_parallel_if()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/parallel/omp_parallel_num_threads.c b/final/runtime/test/parallel/omp_parallel_num_threads.c new file mode 100644 index 0000000..8af1f9d --- /dev/null +++ b/final/runtime/test/parallel/omp_parallel_num_threads.c @@ -0,0 +1,46 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" + +int test_omp_parallel_num_threads() +{ + int num_failed; + int threads; + int nthreads; + int max_threads = 0; + + num_failed = 0; + + /* first we check how many threads are available */ + #pragma omp parallel + { + #pragma omp master + max_threads = omp_get_num_threads (); + } + + /* we increase the number of threads from one to maximum:*/ + for(threads = 1; threads <= max_threads; threads++) { + nthreads = 0; + #pragma omp parallel reduction(+:num_failed) num_threads(threads) + { + num_failed = num_failed + !(threads == omp_get_num_threads()); + #pragma omp atomic + nthreads += 1; + } + num_failed = num_failed + !(nthreads == threads); + } + return (!num_failed); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_parallel_num_threads()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/parallel/omp_parallel_private.c b/final/runtime/test/parallel/omp_parallel_private.c new file mode 100644 index 0000000..238e806 --- /dev/null +++ b/final/runtime/test/parallel/omp_parallel_private.c @@ -0,0 
+1,46 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <stdlib.h> +#include "omp_testsuite.h" + +//static int sum1 = 789; + +int test_omp_parallel_private() +{ + int sum, num_threads,sum1; + int known_sum; + + sum = 0; + num_threads = 0; + + #pragma omp parallel private(sum1) + { + int i; + sum1 = 7; + /*printf("sum1=%d\n",sum1);*/ + #pragma omp for + for (i = 1; i < 1000; i++) { + sum1 = sum1 + i; + } + #pragma omp critical + { + sum = sum + sum1; + num_threads++; + } + } + known_sum = (999 * 1000) / 2 + 7 * num_threads; + return (known_sum == sum); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_parallel_private()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/parallel/omp_parallel_reduction.c b/final/runtime/test/parallel/omp_parallel_reduction.c new file mode 100644 index 0000000..bb00939 --- /dev/null +++ b/final/runtime/test/parallel/omp_parallel_reduction.c @@ -0,0 +1,254 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <math.h> +#include "omp_testsuite.h" + +#define DOUBLE_DIGITS 20 /* dt^DOUBLE_DIGITS */ +#define MAX_FACTOR 10 +#define KNOWN_PRODUCT 3628800 /* 10! 
*/ + +int test_omp_parallel_reduction() +{ + int sum; + int known_sum; + double dsum; + double dknown_sum; + double dt=0.5; /* base of geometric row for + and - test*/ + double rounding_error= 1.E-9; + int diff; + double ddiff; + int product; + int known_product; + int logic_and; + int logic_or; + int bit_and; + int bit_or; + int exclusiv_bit_or; + int logics[LOOPCOUNT]; + int i; + double dpt; + int result; + + sum =0; + dsum=0; + product=1; + logic_and=1; + logic_or=0; + bit_and=1; + bit_or=0; + exclusiv_bit_or=0; + result=0; + dt = 1./3.; + known_sum = (LOOPCOUNT*(LOOPCOUNT+1))/2; + + /* Tests for integers */ + #pragma omp parallel for schedule(dynamic,1) private(i) reduction(+:sum) + for (i=1;i<=LOOPCOUNT;i++) { + sum=sum+i; + } + + if(known_sum!=sum) { + result++; + fprintf(stderr,"Error in sum with integers: Result was %d instead of %d\n",sum,known_sum); + } + + diff = (LOOPCOUNT*(LOOPCOUNT+1))/2; + #pragma omp parallel for schedule(dynamic,1) private(i) reduction(-:diff) + for (i=1;i<=LOOPCOUNT;++i) { + diff=diff-i; + } + + if(diff != 0) { + result++; + fprintf(stderr,"Error in difference with integers: Result was %d instead of 0.\n",diff); + } + + /* Tests for doubles */ + dsum=0; + dpt=1; + for (i=0;i<DOUBLE_DIGITS;++i) { + dpt*=dt; + } + dknown_sum = (1-dpt)/(1-dt); + #pragma omp parallel for schedule(dynamic,1) private(i) reduction(+:dsum) + for (i=0;i<DOUBLE_DIGITS;++i) { + dsum += pow(dt,i); + } + + if( fabs(dsum-dknown_sum) > rounding_error ) { + result++; + fprintf(stderr,"Error in sum with doubles: Result was %f instead of %f (Difference: %E)\n",dsum,dknown_sum, dsum-dknown_sum); + } + + dpt=1; + + for (i=0;i<DOUBLE_DIGITS;++i) { + dpt*=dt; + } + fprintf(stderr,"\n"); + ddiff = (1-dpt)/(1-dt); + #pragma omp parallel for schedule(dynamic,1) private(i) reduction(-:ddiff) + for (i=0;i<DOUBLE_DIGITS;++i) { + ddiff -= pow(dt,i); + } + if( fabs(ddiff) > rounding_error) { + result++; + fprintf(stderr,"Error in Difference with doubles: Result was %E instead 
of 0.0\n",ddiff); + } + + /* Tests for product of integers */ + #pragma omp parallel for schedule(dynamic,1) private(i) reduction(*:product) + for(i=1;i<=MAX_FACTOR;i++) { + product *= i; + } + + known_product = KNOWN_PRODUCT; + if(known_product != product) { + result++; + fprintf(stderr,"Error in Product with integers: Result was %d instead of %d\n\n",product,known_product); + } + + /* Tests for logical and */ + for(i=0;i<LOOPCOUNT;i++) { + logics[i]=1; + } + + #pragma omp parallel for schedule(dynamic,1) private(i) reduction(&&:logic_and) + for(i=0;i<LOOPCOUNT;++i) { + logic_and = (logic_and && logics[i]); + } + if(!logic_and) { + result++; + fprintf(stderr,"Error in logic AND part 1.\n"); + } + + logic_and = 1; + logics[LOOPCOUNT/2]=0; + + #pragma omp parallel for schedule(dynamic,1) private(i) reduction(&&:logic_and) + for(i=0;i<LOOPCOUNT;++i) { + logic_and = logic_and && logics[i]; + } + if(logic_and) { + result++; + fprintf(stderr,"Error in logic AND part 2.\n"); + } + + /* Tests for logical or */ + for(i=0;i<LOOPCOUNT;i++) { + logics[i]=0; + } + + #pragma omp parallel for schedule(dynamic,1) private(i) reduction(||:logic_or) + for(i=0;i<LOOPCOUNT;++i) { + logic_or = logic_or || logics[i]; + } + if(logic_or) { + result++; + fprintf(stderr,"Error in logic OR part 1.\n"); + } + logic_or = 0; + logics[LOOPCOUNT/2]=1; + + #pragma omp parallel for schedule(dynamic,1) private(i) reduction(||:logic_or) + for(i=0;i<LOOPCOUNT;++i) { + logic_or = logic_or || logics[i]; + } + if(!logic_or) { + result++; + fprintf(stderr,"Error in logic OR part 2.\n"); + } + + /* Tests for bitwise and */ + for(i=0;i<LOOPCOUNT;++i) { + logics[i]=1; + } + + #pragma omp parallel for schedule(dynamic,1) private(i) reduction(&:bit_and) + for(i=0;i<LOOPCOUNT;++i) { + bit_and = (bit_and & logics[i]); + } + if(!bit_and) { + result++; + fprintf(stderr,"Error in BIT AND part 1.\n"); + } + + bit_and = 1; + logics[LOOPCOUNT/2]=0; + + #pragma omp parallel for schedule(dynamic,1) private(i) 
reduction(&:bit_and) + for(i=0;i<LOOPCOUNT;++i) { + bit_and = bit_and & logics[i]; + } + if(bit_and) { + result++; + fprintf(stderr,"Error in BIT AND part 2.\n"); + } + + for(i=0;i<LOOPCOUNT;i++) { + logics[i]=0; + } + + /* Tests for bitwise or */ + #pragma omp parallel for schedule(dynamic,1) private(i) reduction(|:bit_or) + for(i=0;i<LOOPCOUNT;++i) { + bit_or = bit_or | logics[i]; + } + if(bit_or) { + result++; + fprintf(stderr,"Error in BIT OR part 1\n"); + } + bit_or = 0; + logics[LOOPCOUNT/2]=1; + + #pragma omp parallel for schedule(dynamic,1) private(i) reduction(|:bit_or) + for(i=0;i<LOOPCOUNT;++i) { + bit_or = bit_or | logics[i]; + } + if(!bit_or) { + result++; + fprintf(stderr,"Error in BIT OR part 2\n"); + } + + for(i=0;i<LOOPCOUNT;i++) { + logics[i]=0; + } + + /* Tests for bitwise xor */ + #pragma omp parallel for schedule(dynamic,1) private(i) reduction(^:exclusiv_bit_or) + for(i=0;i<LOOPCOUNT;++i) { + exclusiv_bit_or = exclusiv_bit_or ^ logics[i]; + } + if(exclusiv_bit_or) { + result++; + fprintf(stderr,"Error in EXCLUSIV BIT OR part 1\n"); + } + + exclusiv_bit_or = 0; + logics[LOOPCOUNT/2]=1; + + #pragma omp parallel for schedule(dynamic,1) private(i) reduction(^:exclusiv_bit_or) + for(i=0;i<LOOPCOUNT;++i) { + exclusiv_bit_or = exclusiv_bit_or ^ logics[i]; + } + if(!exclusiv_bit_or) { + result++; + fprintf(stderr,"Error in EXCLUSIV BIT OR part 2\n"); + } + + /*printf("\nResult:%d\n",result);*/ + return (result==0); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_parallel_reduction()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/parallel/omp_parallel_shared.c b/final/runtime/test/parallel/omp_parallel_shared.c new file mode 100644 index 0000000..3146ca6 --- /dev/null +++ b/final/runtime/test/parallel/omp_parallel_shared.c @@ -0,0 +1,46 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" + +int test_omp_parallel_shared() +{ + int 
i; + int sum; + int known_sum; + + sum = 0; + known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2 ; + + #pragma omp parallel private(i) shared(sum) + { + + int mysum = 0; + #pragma omp for + for (i = 1; i <= LOOPCOUNT; i++) { + mysum = mysum + i; + } + #pragma omp critical + { + sum = sum + mysum; + } + + + } + if (known_sum != sum) { + fprintf(stderr, "KNOWN_SUM = %d; SUM = %d\n", known_sum, sum); + } + return (known_sum == sum); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_parallel_shared()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/tasking/bug_36720.c b/final/runtime/test/tasking/bug_36720.c new file mode 100644 index 0000000..684d675 --- /dev/null +++ b/final/runtime/test/tasking/bug_36720.c @@ -0,0 +1,36 @@ +// RUN: %libomp-compile-and-run + +/* +Bugzilla: https://bugs.llvm.org/show_bug.cgi?id=36720 + +Assertion failure at kmp_runtime.cpp(1715): nthreads > 0. +OMP: Error #13: Assertion failure at kmp_runtime.cpp(1715). + +The assertion fails even with OMP_NUM_THREADS=1. If the second task is removed, +everything runs to completion. If the "omp parallel for" directives are removed +from inside the tasks, once again everything runs fine. +*/ + +#define N 1024 + +int main() { + #pragma omp task + { + int i; + #pragma omp parallel for + for (i = 0; i < N; i++) + (void)0; + } + + #pragma omp task + { + int i; + #pragma omp parallel for + for (i = 0; i < N; ++i) + (void)0; + } + + #pragma omp taskwait + + return 0; +} diff --git a/final/runtime/test/tasking/bug_nested_proxy_task.c b/final/runtime/test/tasking/bug_nested_proxy_task.c new file mode 100644 index 0000000..6c00822 --- /dev/null +++ b/final/runtime/test/tasking/bug_nested_proxy_task.c @@ -0,0 +1,131 @@ +// RUN: %libomp-compile -lpthread && %libomp-run +// The runtime currently does not get dependency information from GCC. 
+// UNSUPPORTED: gcc + +#include <stdio.h> +#include <omp.h> +#include <pthread.h> +#include "omp_my_sleep.h" + +/* + With task dependencies one can generate proxy tasks from an explicit task + being executed by a serial task team. The OpenMP runtime library didn't + expect that and tries to free the explicit task that is the parent of the + proxy task still working in background. It therefore has incomplete children + which triggers a debugging assertion. +*/ + +// Compiler-generated code (emulation) +typedef long kmp_intptr_t; +typedef int kmp_int32; + +typedef char bool; + +typedef struct ident { + kmp_int32 reserved_1; /**< might be used in Fortran; see above */ + kmp_int32 flags; /**< also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC identifies this union member */ + kmp_int32 reserved_2; /**< not really used in Fortran any more; see above */ +#if USE_ITT_BUILD + /* but currently used for storing region-specific ITT */ + /* contextual information. */ +#endif /* USE_ITT_BUILD */ + kmp_int32 reserved_3; /**< source[4] in Fortran, do not use for C++ */ + char const *psource; /**< String describing the source location. + The string is composed of semi-colon separated fields which describe the source file, + the function and a pair of line numbers that delimit the construct. + */ +} ident_t; + +typedef struct kmp_depend_info { + kmp_intptr_t base_addr; + size_t len; + struct { + bool in:1; + bool out:1; + } flags; +} kmp_depend_info_t; + +struct kmp_task; +typedef kmp_int32 (* kmp_routine_entry_t)( kmp_int32, struct kmp_task * ); + +typedef struct kmp_task { /* GEH: Shouldn't this be aligned somehow? 
*/ + void * shareds; /**< pointer to block of pointers to shared vars */ + kmp_routine_entry_t routine; /**< pointer to routine to call for executing task */ + kmp_int32 part_id; /**< part id for the task */ +} kmp_task_t; + +#ifdef __cplusplus +extern "C" { +#endif +kmp_int32 __kmpc_global_thread_num ( ident_t * ); +kmp_task_t* +__kmpc_omp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags, + size_t sizeof_kmp_task_t, size_t sizeof_shareds, + kmp_routine_entry_t task_entry ); +void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask ); +kmp_int32 __kmpc_omp_task_with_deps ( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task, + kmp_int32 ndeps, kmp_depend_info_t *dep_list, + kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list ); +kmp_int32 +__kmpc_omp_task( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task ); +#ifdef __cplusplus +} +#endif + +void *target(void *task) +{ + my_sleep( 0.1 ); + __kmpc_proxy_task_completed_ooo((kmp_task_t*) task); + return NULL; +} + +pthread_t target_thread; + +// User's code +int task_entry(kmp_int32 gtid, kmp_task_t *task) +{ + pthread_create(&target_thread, NULL, &target, task); + return 0; +} + +int main() +{ + int dep; + +#pragma omp taskgroup +{ +/* + * Corresponds to: + #pragma omp target nowait depend(out: dep) + { + my_sleep( 0.1 ); + } +*/ + kmp_depend_info_t dep_info; + dep_info.base_addr = (long) &dep; + dep_info.len = sizeof(int); + // out = inout per spec and runtime expects this + dep_info.flags.in = 1; + dep_info.flags.out = 1; + + kmp_int32 gtid = __kmpc_global_thread_num(NULL); + kmp_task_t *proxy_task = __kmpc_omp_task_alloc(NULL,gtid,17,sizeof(kmp_task_t),0,&task_entry); + __kmpc_omp_task_with_deps(NULL,gtid,proxy_task,1,&dep_info,0,NULL); + + #pragma omp task depend(in: dep) + { +/* + * Corresponds to: + #pragma omp target nowait + { + my_sleep( 0.1 ); + } +*/ + kmp_task_t *nested_proxy_task = __kmpc_omp_task_alloc(NULL,gtid,17,sizeof(kmp_task_t),0,&task_entry); + 
__kmpc_omp_task(NULL,gtid,nested_proxy_task); + } +} + + // only check that it didn't crash + return 0; +} diff --git a/final/runtime/test/tasking/bug_proxy_task_dep_waiting.c b/final/runtime/test/tasking/bug_proxy_task_dep_waiting.c new file mode 100644 index 0000000..e6dd895 --- /dev/null +++ b/final/runtime/test/tasking/bug_proxy_task_dep_waiting.c @@ -0,0 +1,134 @@ +// RUN: %libomp-compile -lpthread && %libomp-run +// The runtime currently does not get dependency information from GCC. +// UNSUPPORTED: gcc + +#include <stdio.h> +#include <omp.h> +#include <pthread.h> +#include "omp_my_sleep.h" + +/* + An explicit task can have a dependency on a target task. If it is not + directly satisfied, the runtime should not wait but resume execution. +*/ + +// Compiler-generated code (emulation) +typedef long kmp_intptr_t; +typedef int kmp_int32; + +typedef char bool; + +typedef struct ident { + kmp_int32 reserved_1; /**< might be used in Fortran; see above */ + kmp_int32 flags; /**< also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC identifies this union member */ + kmp_int32 reserved_2; /**< not really used in Fortran any more; see above */ +#if USE_ITT_BUILD + /* but currently used for storing region-specific ITT */ + /* contextual information. */ +#endif /* USE_ITT_BUILD */ + kmp_int32 reserved_3; /**< source[4] in Fortran, do not use for C++ */ + char const *psource; /**< String describing the source location. + The string is composed of semi-colon separated fields which describe the source file, + the function and a pair of line numbers that delimit the construct. + */ +} ident_t; + +typedef struct kmp_depend_info { + kmp_intptr_t base_addr; + size_t len; + struct { + bool in:1; + bool out:1; + } flags; +} kmp_depend_info_t; + +struct kmp_task; +typedef kmp_int32 (* kmp_routine_entry_t)( kmp_int32, struct kmp_task * ); + +typedef struct kmp_task { /* GEH: Shouldn't this be aligned somehow? 
*/ + void * shareds; /**< pointer to block of pointers to shared vars */ + kmp_routine_entry_t routine; /**< pointer to routine to call for executing task */ + kmp_int32 part_id; /**< part id for the task */ +} kmp_task_t; + +#ifdef __cplusplus +extern "C" { +#endif +kmp_int32 __kmpc_global_thread_num ( ident_t * ); +kmp_task_t* +__kmpc_omp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags, + size_t sizeof_kmp_task_t, size_t sizeof_shareds, + kmp_routine_entry_t task_entry ); +void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask ); +kmp_int32 __kmpc_omp_task_with_deps ( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task, + kmp_int32 ndeps, kmp_depend_info_t *dep_list, + kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list ); +kmp_int32 +__kmpc_omp_task( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task ); +#ifdef __cplusplus +} +#endif + +void *target(void *task) +{ + my_sleep( 0.1 ); + __kmpc_proxy_task_completed_ooo((kmp_task_t*) task); + return NULL; +} + +pthread_t target_thread; + +// User's code +int task_entry(kmp_int32 gtid, kmp_task_t *task) +{ + pthread_create(&target_thread, NULL, &target, task); + return 0; +} + +int main() +{ + int dep; + +/* + * Corresponds to: + #pragma omp target nowait depend(out: dep) + { + my_sleep( 0.1 ); + } +*/ + kmp_depend_info_t dep_info; + dep_info.base_addr = (long) &dep; + dep_info.len = sizeof(int); + // out = inout per spec and runtime expects this + dep_info.flags.in = 1; + dep_info.flags.out = 1; + + kmp_int32 gtid = __kmpc_global_thread_num(NULL); + kmp_task_t *proxy_task = __kmpc_omp_task_alloc(NULL,gtid,17,sizeof(kmp_task_t),0,&task_entry); + __kmpc_omp_task_with_deps(NULL,gtid,proxy_task,1,&dep_info,0,NULL); + + int first_task_finished = 0; + #pragma omp task shared(first_task_finished) depend(inout: dep) + { + first_task_finished = 1; + } + + int second_task_finished = 0; + #pragma omp task shared(second_task_finished) depend(in: dep) + { + second_task_finished = 1; + } + + // 
check that execution has been resumed and the runtime has not waited + // for the dependencies to be satisfied. + int error = (first_task_finished == 1); + error += (second_task_finished == 1); + + #pragma omp taskwait + + // by now all tasks should have finished + error += (first_task_finished != 1); + error += (second_task_finished != 1); + + return error; +} diff --git a/final/runtime/test/tasking/bug_serial_taskgroup.c b/final/runtime/test/tasking/bug_serial_taskgroup.c new file mode 100644 index 0000000..850bc90 --- /dev/null +++ b/final/runtime/test/tasking/bug_serial_taskgroup.c @@ -0,0 +1,16 @@ +// RUN: %libomp-compile-and-run + +/* + GCC failed this test because __kmp_get_gtid() instead of __kmp_entry_gtid() + was called in xexpand(KMP_API_NAME_GOMP_TASKGROUP_START)(void). + __kmp_entry_gtid() will initialize the runtime if not yet done which does not + happen with __kmp_get_gtid(). + */ + +int main() +{ + #pragma omp taskgroup + { } + + return 0; +} diff --git a/final/runtime/test/tasking/kmp_task_reduction_nest.cpp b/final/runtime/test/tasking/kmp_task_reduction_nest.cpp new file mode 100644 index 0000000..63dffe4 --- /dev/null +++ b/final/runtime/test/tasking/kmp_task_reduction_nest.cpp @@ -0,0 +1,376 @@ +// RUN: %libomp-cxx-compile-and-run +// RUN: %libomp-cxx-compile -DFLG=1 && %libomp-run +// GCC-5 is needed for OpenMP 4.0 support (taskgroup) +// XFAIL: gcc-4 +#include <cstdio> +#include <cmath> +#include <cassert> +#include <omp.h> + +// Total number of loop iterations, should be multiple of T for this test +#define N 10000 + +// Flag to request lazy (1) or eager (0) allocation of reduction objects +#ifndef FLG +#define FLG 0 +#endif + +/* + // initial user's code that corresponds to pseudo code of the test + #pragma omp taskgroup task_reduction(+:i,j) task_reduction(*:x) + { + for( int l = 0; l < N; ++l ) { + #pragma omp task firstprivate(l) in_reduction(+:i) in_reduction(*:x) + { + i += l; + if( l%2 ) + x *= 1.0 / (l + 1); + else + x *= (l + 1); + 
} + } + + #pragma omp taskgroup task_reduction(-:i,k) task_reduction(+:y) + { + for( int l = 0; l < N; ++l ) { + #pragma omp task firstprivate(l) in_reduction(+:j,y) \ + in_reduction(*:x) in_reduction(-:k) + { + j += l; + k -= l; + y += (double)l; + if( l%2 ) + x *= 1.0 / (l + 1); + else + x *= (l + 1); + } + #pragma omp task firstprivate(l) in_reduction(+:y) in_reduction(-:i,k) + { + i -= l; + k -= l; + y += (double)l; + } + #pragma omp task firstprivate(l) in_reduction(+:j) in_reduction(*:x) + { + j += l; + if( l%2 ) + x *= 1.0 / (l + 1); + else + x *= (l + 1); + } + } + } // inner reduction + + for( int l = 0; l < N; ++l ) { + #pragma omp task firstprivate(l) in_reduction(+:j) + j += l; + } + } // outer reduction +*/ + +//------------------------------------------------ +// OpenMP runtime library routines +#ifdef __cplusplus +extern "C" { +#endif +extern void* __kmpc_task_reduction_get_th_data(int gtid, void* tg, void* item); +extern void* __kmpc_task_reduction_init(int gtid, int num, void* data); +extern int __kmpc_global_thread_num(void*); +#ifdef __cplusplus +} +#endif + +//------------------------------------------------ +// Compiler-generated code + +typedef struct _task_red_item { + void *shar; // shared reduction item + size_t size; // size of data item + void *f_init; // data initialization routine + void *f_fini; // data finalization routine + void *f_comb; // data combiner routine + unsigned flags; +} _task_red_item_t; + +// int:+ no need in init/fini callbacks, valid for subtraction +void __red_int_add_comb(void *lhs, void *rhs) // combiner +{ *(int*)lhs += *(int*)rhs; } + +// long long:+ no need in init/fini callbacks, valid for subtraction +void __red_llong_add_comb(void *lhs, void *rhs) // combiner +{ *(long long*)lhs += *(long long*)rhs; } + +// double:* no need in fini callback +void __red_dbl_mul_init(void *data) // initializer +{ *(double*)data = 1.0; } +void __red_dbl_mul_comb(void *lhs, void *rhs) // combiner +{ *(double*)lhs *= 
*(double*)rhs; } + +// double:+ no need in init/fini callbacks +void __red_dbl_add_comb(void *lhs, void *rhs) // combiner +{ *(double*)lhs += *(double*)rhs; } + +// ============================== + +void calc_serial(int *pi, long long *pj, double *px, long long *pk, double *py) +{ + for( int l = 0; l < N; ++l ) { + *pi += l; + if( l%2 ) + *px *= 1.0 / (l + 1); + else + *px *= (l + 1); + } + for( int l = 0; l < N; ++l ) { + *pj += l; + *pk -= l; + *py += (double)l; + if( l%2 ) + *px *= 1.0 / (l + 1); + else + *px *= (l + 1); + + *pi -= l; + *pk -= l; + *py += (double)l; + + *pj += l; + if( l%2 ) + *px *= 1.0 / (l + 1); + else + *px *= (l + 1); + } + for( int l = 0; l < N; ++l ) { + *pj += l; + } +} + +//------------------------------------------------ +// Test case +int main() +{ + int nthreads = omp_get_max_threads(); + int err = 0; + void** ptrs = (void**)malloc(nthreads*sizeof(void*)); + + // user's code ====================================== + // variables for serial calculations: + int is = 3; + long long js = -9999999; + double xs = 99999.0; + long long ks = 99999999; + double ys = -99999999.0; + // variables for parallel calculations: + int ip = 3; + long long jp = -9999999; + double xp = 99999.0; + long long kp = 99999999; + double yp = -99999999.0; + + calc_serial(&is, &js, &xs, &ks, &ys); + // ================================================== + for (int i = 0; i < nthreads; ++i) + ptrs[i] = NULL; + #pragma omp parallel + { + #pragma omp single nowait + { + // outer taskgroup reduces (i,j,x) + #pragma omp taskgroup // task_reduction(+:i,j) task_reduction(*:x) + { + _task_red_item_t red_data[3]; + red_data[0].shar = &ip; + red_data[0].size = sizeof(ip); + red_data[0].f_init = NULL; // RTL will zero thread-specific objects + red_data[0].f_fini = NULL; // no destructors needed + red_data[0].f_comb = (void*)&__red_int_add_comb; + red_data[0].flags = FLG; + red_data[1].shar = &jp; + red_data[1].size = sizeof(jp); + red_data[1].f_init = NULL; // RTL will zero 
thread-specific objects + red_data[1].f_fini = NULL; // no destructors needed + red_data[1].f_comb = (void*)&__red_llong_add_comb; + red_data[1].flags = FLG; + red_data[2].shar = &xp; + red_data[2].size = sizeof(xp); + red_data[2].f_init = (void*)&__red_dbl_mul_init; + red_data[2].f_fini = NULL; // no destructors needed + red_data[2].f_comb = (void*)&__red_dbl_mul_comb; + red_data[2].flags = FLG; + int gtid = __kmpc_global_thread_num(NULL); + void* tg1 = __kmpc_task_reduction_init(gtid, 3, red_data); + + for( int l = 0; l < N; l += 2 ) { + // 2 iterations per task to get correct x value; actually any even + // number of iters per task will work, otherwise x looses precision + #pragma omp task firstprivate(l) //in_reduction(+:i) in_reduction(*:x) + { + int gtid = __kmpc_global_thread_num(NULL); + int *p_ip = (int*)__kmpc_task_reduction_get_th_data(gtid, tg1, &ip); + double *p_xp = (double*)__kmpc_task_reduction_get_th_data( + gtid, tg1, &xp); + if (!ptrs[gtid]) ptrs[gtid] = p_xp; + + // user's pseudo-code ============================== + *p_ip += l; + *p_xp *= (l + 1); + + *p_ip += l + 1; + *p_xp *= 1.0 / (l + 2); + // ================================================== + } + } + // inner taskgroup reduces (i,k,y), i is same object as in outer one + #pragma omp taskgroup // task_reduction(-:i,k) task_reduction(+:y) + { + _task_red_item_t red_data[3]; + red_data[0].shar = &ip; + red_data[0].size = sizeof(ip); + red_data[0].f_init = NULL; // RTL will zero thread-specific objects + red_data[0].f_fini = NULL; // no destructors needed + red_data[0].f_comb = (void*)&__red_int_add_comb; + red_data[0].flags = FLG; + red_data[1].shar = &kp; + red_data[1].size = sizeof(kp); + red_data[1].f_init = NULL; // RTL will zero thread-specific objects + red_data[1].f_fini = NULL; // no destructors needed + red_data[1].f_comb = (void*)&__red_llong_add_comb; // same for + and - + red_data[1].flags = FLG; + red_data[2].shar = &yp; + red_data[2].size = sizeof(yp); + red_data[2].f_init = 
NULL; // RTL will zero thread-specific objects + red_data[2].f_fini = NULL; // no destructors needed + red_data[2].f_comb = (void*)&__red_dbl_add_comb; + red_data[2].flags = FLG; + int gtid = __kmpc_global_thread_num(NULL); + void* tg2 = __kmpc_task_reduction_init(gtid, 3, red_data); + + for( int l = 0; l < N; l += 2 ) { + #pragma omp task firstprivate(l) + // in_reduction(+:j,y) in_reduction(*:x) in_reduction(-:k) + { + int gtid = __kmpc_global_thread_num(NULL); + long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data( + gtid, tg1, &jp); + long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data( + gtid, tg2, &kp); + double *p_xp = (double*)__kmpc_task_reduction_get_th_data( + gtid, tg1, &xp); + double *p_yp = (double*)__kmpc_task_reduction_get_th_data( + gtid, tg2, &yp); + // user's pseudo-code ============================== + *p_jp += l; + *p_kp -= l; + *p_yp += (double)l; + *p_xp *= (l + 1); + + *p_jp += l + 1; + *p_kp -= l + 1; + *p_yp += (double)(l + 1); + *p_xp *= 1.0 / (l + 2); + // ================================================= +{ + // the following code is here just to check __kmpc_task_reduction_get_th_data: + int tid = omp_get_thread_num(); + void *addr1; + void *addr2; + addr1 = __kmpc_task_reduction_get_th_data(gtid, tg1, &xp); // from shared + addr2 = __kmpc_task_reduction_get_th_data(gtid, tg1, addr1); // from private + if (addr1 != addr2) { + #pragma omp atomic + ++err; + printf("Wrong thread-specific addresses %d s:%p p:%p\n", tid, addr1, addr2); + } + // from neighbour w/o taskgroup (should start lookup from current tg2) + if (tid > 0) { + if (ptrs[tid-1]) { + addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[tid-1]); + if (addr1 != addr2) { + #pragma omp atomic + ++err; + printf("Wrong thread-specific addresses %d s:%p n:%p\n", + tid, addr1, addr2); + } + } + } else { + if (ptrs[nthreads-1]) { + addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[nthreads-1]); + if (addr1 != addr2) { + #pragma omp atomic + 
++err; + printf("Wrong thread-specific addresses %d s:%p n:%p\n", + tid, addr1, addr2); + } + } + } + // ---------------------------------------------- +} + } + #pragma omp task firstprivate(l) + // in_reduction(+:y) in_reduction(-:i,k) + { + int gtid = __kmpc_global_thread_num(NULL); + int *p_ip = (int*)__kmpc_task_reduction_get_th_data( + gtid, tg2, &ip); + long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data( + gtid, tg2, &kp); + double *p_yp = (double*)__kmpc_task_reduction_get_th_data( + gtid, tg2, &yp); + + // user's pseudo-code ============================== + *p_ip -= l; + *p_kp -= l; + *p_yp += (double)l; + + *p_ip -= l + 1; + *p_kp -= l + 1; + *p_yp += (double)(l + 1); + // ================================================= + } + #pragma omp task firstprivate(l) + // in_reduction(+:j) in_reduction(*:x) + { + int gtid = __kmpc_global_thread_num(NULL); + long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data( + gtid, tg1, &jp); + double *p_xp = (double*)__kmpc_task_reduction_get_th_data( + gtid, tg1, &xp); + // user's pseudo-code ============================== + *p_jp += l; + *p_xp *= (l + 1); + + *p_jp += l + 1; + *p_xp *= 1.0 / (l + 2); + // ================================================= + } + } + } // inner reduction + + for( int l = 0; l < N; l += 2 ) { + #pragma omp task firstprivate(l) // in_reduction(+:j) + { + int gtid = __kmpc_global_thread_num(NULL); + long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data( + gtid, tg1, &jp); + // user's pseudo-code ============================== + *p_jp += l; + *p_jp += l + 1; + // ================================================= + } + } + } // outer reduction + } // end single + } // end parallel + // check results +#if _DEBUG + printf("reduction flags = %u\n", FLG); +#endif + if (ip == is && jp == js && ks == kp && + fabs(xp - xs) < 0.01 && fabs(yp - ys) < 0.01) + printf("passed\n"); + else + printf("failed,\n ser:(%d %lld %f %lld %f)\n par:(%d %lld %f %lld %f)\n", + is, js, xs, 
ks, ys, + ip, jp, xp, kp, yp); + return 0; +} diff --git a/final/runtime/test/tasking/kmp_taskloop.c b/final/runtime/test/tasking/kmp_taskloop.c new file mode 100644 index 0000000..4b13793 --- /dev/null +++ b/final/runtime/test/tasking/kmp_taskloop.c @@ -0,0 +1,159 @@ +// RUN: %libomp-compile-and-run +// RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run +#include <stdio.h> +#include <omp.h> +#include "omp_my_sleep.h" + +#define N 4 +#define GRAIN 10 +#define STRIDE 3 + +// globals +int th_counter[N]; +int counter; + + +// Compiler-generated code (emulation) +typedef struct ident { + void* dummy; +} ident_t; + +typedef struct shar { + int(*pth_counter)[N]; + int *pcounter; + int *pj; +} *pshareds; + +typedef struct task { + pshareds shareds; + int(* routine)(int,struct task*); + int part_id; +// privates: + unsigned long long lb; // library always uses ULONG + unsigned long long ub; + int st; + int last; + int i; + int j; + int th; +} *ptask, kmp_task_t; + +typedef int(* task_entry_t)( int, ptask ); + +void +__task_dup_entry(ptask task_dst, ptask task_src, int lastpriv) +{ +// setup lastprivate flag + task_dst->last = lastpriv; +// could be constructor calls here... 
+} + + +// OpenMP RTL interfaces +typedef unsigned long long kmp_uint64; +typedef long long kmp_int64; + +#ifdef __cplusplus +extern "C" { +#endif +void +__kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, + kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, + int nogroup, int sched, kmp_int64 grainsize, void *task_dup ); +ptask +__kmpc_omp_task_alloc( ident_t *loc, int gtid, int flags, + size_t sizeof_kmp_task_t, size_t sizeof_shareds, + task_entry_t task_entry ); +void __kmpc_atomic_fixed4_add(void *id_ref, int gtid, int * lhs, int rhs); +int __kmpc_global_thread_num(void *id_ref); +#ifdef __cplusplus +} +#endif + + +// User's code +int task_entry(int gtid, ptask task) +{ + pshareds pshar = task->shareds; + for( task->i = task->lb; task->i <= (int)task->ub; task->i += task->st ) { + task->th = omp_get_thread_num(); + __kmpc_atomic_fixed4_add(NULL,gtid,pshar->pcounter,1); + __kmpc_atomic_fixed4_add(NULL,gtid,&((*pshar->pth_counter)[task->th]),1); + task->j = task->i; + } + my_sleep( 0.1 ); // sleep 100 ms in order to allow other threads to steal tasks + if( task->last ) { + *(pshar->pj) = task->j; // lastprivate + } + return 0; +} + +int main() +{ + int i, j, gtid = __kmpc_global_thread_num(NULL); + ptask task; + pshareds psh; + omp_set_dynamic(0); + counter = 0; + for( i=0; i<N; ++i ) + th_counter[i] = 0; + #pragma omp parallel num_threads(N) + { + #pragma omp master + { + int gtid = __kmpc_global_thread_num(NULL); +/* + * This is what the OpenMP runtime calls correspond to: + #pragma omp taskloop num_tasks(N) lastprivate(j) + for( i=0; i<N*GRAIN*STRIDE-1; i+=STRIDE ) + { + int th = omp_get_thread_num(); + #pragma omp atomic + counter++; + #pragma omp atomic + th_counter[th]++; + j = i; + } +*/ + task = __kmpc_omp_task_alloc(NULL,gtid,1,sizeof(struct task),sizeof(struct shar),&task_entry); + psh = task->shareds; + psh->pth_counter = &th_counter; + psh->pcounter = &counter; + psh->pj = &j; + task->lb = 0; + task->ub = N*GRAIN*STRIDE-2; + task->st = 
STRIDE; + + __kmpc_taskloop( + NULL, // location + gtid, // gtid + task, // task structure + 1, // if clause value + &task->lb, // lower bound + &task->ub, // upper bound + STRIDE, // loop increment + 0, // 1 if nogroup specified + 2, // schedule type: 0-none, 1-grainsize, 2-num_tasks + N, // schedule value (ignored for type 0) + (void*)&__task_dup_entry // tasks duplication routine + ); + } // end master + } // end parallel +// check results + if( j != N*GRAIN*STRIDE-STRIDE ) { + printf("Error in lastprivate, %d != %d\n",j,N*GRAIN*STRIDE-STRIDE); + return 1; + } + if( counter != N*GRAIN ) { + printf("Error, counter %d != %d\n",counter,N*GRAIN); + return 1; + } + for( i=0; i<N; ++i ) { + if( th_counter[i] % GRAIN ) { + printf("Error, th_counter[%d] = %d\n",i,th_counter[i]); + return 1; + } + } + printf("passed\n"); + return 0; +} diff --git a/final/runtime/test/tasking/nested_parallel_tasking.c b/final/runtime/test/tasking/nested_parallel_tasking.c new file mode 100644 index 0000000..4374d6e --- /dev/null +++ b/final/runtime/test/tasking/nested_parallel_tasking.c @@ -0,0 +1,32 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <omp.h> + +/* + * This test would hang when level instead of active level + * used to push task state. 
+ */ + +int main() +{ + // If num_threads is changed to a value greater than 1, then the test passes + #pragma omp parallel num_threads(1) + { + #pragma omp parallel + printf("Hello World from thread %d\n", omp_get_thread_num()); + } + + printf("omp_num_threads: %d\n", omp_get_max_threads()); + + #pragma omp parallel + { + #pragma omp master + #pragma omp task default(none) + { + printf("%d is executing this task\n", omp_get_thread_num()); + } + } + + printf("pass\n"); + return 0; +} diff --git a/final/runtime/test/tasking/nested_task_creation.c b/final/runtime/test/tasking/nested_task_creation.c new file mode 100644 index 0000000..c7c25fc --- /dev/null +++ b/final/runtime/test/tasking/nested_task_creation.c @@ -0,0 +1,35 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <omp.h> +#include "omp_my_sleep.h" + +/* + * This test creates tasks that themselves create a new task. + * The runtime has to take care that they are correctly freed. + */ + +int main() +{ + #pragma omp task + { + #pragma omp task + { + my_sleep( 0.1 ); + } + } + + #pragma omp parallel num_threads(2) + { + #pragma omp single + #pragma omp task + { + #pragma omp task + { + my_sleep( 0.1 ); + } + } + } + + printf("pass\n"); + return 0; +} diff --git a/final/runtime/test/tasking/omp_task.c b/final/runtime/test/tasking/omp_task.c new file mode 100644 index 0000000..c534abe --- /dev/null +++ b/final/runtime/test/tasking/omp_task.c @@ -0,0 +1,52 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <math.h> +#include "omp_testsuite.h" +#include "omp_my_sleep.h" + +int test_omp_task() +{ + int tids[NUM_TASKS]; + int i; + + #pragma omp parallel + { + #pragma omp single + { + for (i = 0; i < NUM_TASKS; i++) { + /* First we have to store the value of the loop index in a new variable + * which will be private for each task because otherwise it will be overwritten + * if the execution of the task takes longer than the time which is needed to + * enter the next step of the loop! 
+ */ + int myi; + myi = i; + #pragma omp task + { + my_sleep (SLEEPTIME); + tids[myi] = omp_get_thread_num(); + } /* end of omp task */ + } /* end of for */ + } /* end of single */ + } /*end of parallel */ + + /* Now we ckeck if more than one thread executed the tasks. */ + for (i = 1; i < NUM_TASKS; i++) { + if (tids[0] != tids[i]) + return 1; + } + return 0; +} /* end of check_parallel_for_private */ + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_task()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/tasking/omp_task_final.c b/final/runtime/test/tasking/omp_task_final.c new file mode 100644 index 0000000..b531af6 --- /dev/null +++ b/final/runtime/test/tasking/omp_task_final.c @@ -0,0 +1,65 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <math.h> +#include "omp_testsuite.h" +#include "omp_my_sleep.h" + +int test_omp_task_final() +{ + int tids[NUM_TASKS]; + int includedtids[NUM_TASKS]; + int i; + int error = 0; + #pragma omp parallel + { + #pragma omp single + { + for (i = 0; i < NUM_TASKS; i++) { + /* First we have to store the value of the loop index in a new variable + * which will be private for each task because otherwise it will be overwritten + * if the execution of the task takes longer than the time which is needed to + * enter the next step of the loop! + */ + int myi; + myi = i; + + #pragma omp task final(i>=10) + { + tids[myi] = omp_get_thread_num(); + /* we generate included tasks for final tasks */ + if(myi >= 10) { + int included = myi; + #pragma omp task + { + my_sleep (SLEEPTIME); + includedtids[included] = omp_get_thread_num(); + } /* end of omp included task of the final task */ + my_sleep (SLEEPTIME); + } /* end of if it is a final task*/ + } /* end of omp task */ + } /* end of for */ + } /* end of single */ + } /*end of parallel */ + + /* Now we ckeck if more than one thread executed the final task and its included task. 
*/ + for (i = 10; i < NUM_TASKS; i++) { + if (tids[i] != includedtids[i]) { + error++; + } + } + return (error==0); +} /* end of check_paralel_for_private */ + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_task_final()) { + num_failed++; + } + } + return num_failed; +} + diff --git a/final/runtime/test/tasking/omp_task_firstprivate.c b/final/runtime/test/tasking/omp_task_firstprivate.c new file mode 100644 index 0000000..d1f7c35 --- /dev/null +++ b/final/runtime/test/tasking/omp_task_firstprivate.c @@ -0,0 +1,51 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <math.h> +#include "omp_testsuite.h" + +int test_omp_task_firstprivate() +{ + int i; + int sum = 1234; + int known_sum; + int result = 0; /* counts the wrong sums from tasks */ + + known_sum = 1234 + (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; + + #pragma omp parallel + { + #pragma omp single + { + for (i = 0; i < NUM_TASKS; i++) { + #pragma omp task firstprivate(sum) + { + int j; + for (j = 0; j <= LOOPCOUNT; j++) { + #pragma omp flush + sum += j; + } + + /* check if calculated sum was right */ + if (sum != known_sum) { + #pragma omp critical + { result++; } + } + } /* omp task */ + } /* for loop */ + } /* omp single */ + } /* omp parallel */ + return (result == 0); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_task_firstprivate()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/tasking/omp_task_if.c b/final/runtime/test/tasking/omp_task_if.c new file mode 100644 index 0000000..8b4728e --- /dev/null +++ b/final/runtime/test/tasking/omp_task_if.c @@ -0,0 +1,43 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <math.h> +#include "omp_testsuite.h" +#include "omp_my_sleep.h" + +int test_omp_task_if() +{ + int condition_false; + int count; + int result; + + count=0; + condition_false = (count == 1); + #pragma omp parallel + { + #pragma omp single + 
{ + #pragma omp task if (condition_false) shared(count, result) + { + my_sleep (SLEEPTIME); + #pragma omp critical + result = (0 == count); + } /* end of omp task */ + #pragma omp critical + count = 1; + } /* end of single */ + } /*end of parallel */ + return result; +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_task_if()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/tasking/omp_task_imp_firstprivate.c b/final/runtime/test/tasking/omp_task_imp_firstprivate.c new file mode 100644 index 0000000..905ab9a --- /dev/null +++ b/final/runtime/test/tasking/omp_task_imp_firstprivate.c @@ -0,0 +1,47 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <math.h> +#include "omp_testsuite.h" + +/* Utility function do spend some time in a loop */ +int test_omp_task_imp_firstprivate() +{ + int i=5; + int k = 0; + int result = 0; + int task_result = 1; + #pragma omp parallel firstprivate(i) + { + #pragma omp single + { + for (k = 0; k < NUM_TASKS; k++) { + #pragma omp task shared(result , task_result) + { + int j; + //check if i is private + if(i != 5) + task_result = 0; + for(j = 0; j < NUM_TASKS; j++) + i++; + //this should be firstprivate implicitly + } + } + #pragma omp taskwait + result = (task_result && i==5); + } + } + return result; +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_task_imp_firstprivate()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/tasking/omp_task_priority.c b/final/runtime/test/tasking/omp_task_priority.c new file mode 100644 index 0000000..7b62360 --- /dev/null +++ b/final/runtime/test/tasking/omp_task_priority.c @@ -0,0 +1,22 @@ +// RUN: %libomp-compile && env OMP_MAX_TASK_PRIORITY=42 %libomp-run +// Test OMP 4.5 task priorities +// Currently only API function and envirable parsing implemented. 
+// Test environment sets envirable: OMP_MAX_TASK_PRIORITY=42 as tested below. +#include <stdio.h> +#include <omp.h> + +int main (void) { + int passed; + + passed = (omp_get_max_task_priority() == 42); + printf("Got %d\n", omp_get_max_task_priority()); + + if (passed) { + printf("passed\n"); + return 0; + } + + printf("failed\n"); + return 1; +} + diff --git a/final/runtime/test/tasking/omp_task_private.c b/final/runtime/test/tasking/omp_task_private.c new file mode 100644 index 0000000..7a93716 --- /dev/null +++ b/final/runtime/test/tasking/omp_task_private.c @@ -0,0 +1,53 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <math.h> +#include "omp_testsuite.h" + +/* Utility function do spend some time in a loop */ +int test_omp_task_private() +{ + int i; + int known_sum; + int sum = 0; + int result = 0; /* counts the wrong sums from tasks */ + + known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; + + #pragma omp parallel + { + #pragma omp single + { + for (i = 0; i < NUM_TASKS; i++) { + #pragma omp task private(sum) shared(result, known_sum) + { + int j; + //if sum is private, initialize to 0 + sum = 0; + for (j = 0; j <= LOOPCOUNT; j++) { + #pragma omp flush + sum += j; + } + /* check if calculated sum was right */ + if (sum != known_sum) { + #pragma omp critical + result++; + } + } /* end of omp task */ + } /* end of for */ + } /* end of single */ + } /* end of parallel*/ + return (result == 0); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_task_private()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/tasking/omp_task_shared.c b/final/runtime/test/tasking/omp_task_shared.c new file mode 100644 index 0000000..0304026 --- /dev/null +++ b/final/runtime/test/tasking/omp_task_shared.c @@ -0,0 +1,41 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <math.h> +#include "omp_testsuite.h" + +/* Utility function do spend some time in a loop */ +int 
test_omp_task_imp_shared() +{ + int i; + int k = 0; + int result = 0; + i=0; + + #pragma omp parallel + { + #pragma omp single + for (k = 0; k < NUM_TASKS; k++) { + #pragma omp task shared(i) + { + #pragma omp atomic + i++; + //this should be shared implicitly + } + } + } + result = i; + return ((result == NUM_TASKS)); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_task_imp_shared()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/tasking/omp_taskloop_grainsize.c b/final/runtime/test/tasking/omp_taskloop_grainsize.c new file mode 100644 index 0000000..0833073 --- /dev/null +++ b/final/runtime/test/tasking/omp_taskloop_grainsize.c @@ -0,0 +1,113 @@ +// RUN: %libomp-compile-and-run +// RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run + +// These compilers don't support the taskloop construct +// UNSUPPORTED: gcc-4, gcc-5, icc-16 +// GCC 6 has support for taskloops, but at least 6.3.0 is crashing on this test +// UNSUPPORTED: gcc-6 + +/* + * Test for taskloop + * Method: caculate how many times the iteration space is dispatched + * and judge if each dispatch has the requested grainsize + * It is possible for two adjacent chunks are executed by the same thread + */ +#include <stdio.h> +#include <omp.h> +#include <stdlib.h> +#include "omp_testsuite.h" + +#define CFDMAX_SIZE 1120 + +int test_omp_taskloop_grainsize() +{ + int result = 0; + int i, grainsize, count, tmp_count, num_off; + int *tmp, *tids, *tidsArray; + + tidsArray = (int *)malloc(sizeof(int) * CFDMAX_SIZE); + tids = tidsArray; + + for (grainsize = 1; grainsize < 48; ++grainsize) { + fprintf(stderr, "Grainsize %d\n", grainsize); + count = tmp_count = num_off = 0; + + for (i = 0; i < CFDMAX_SIZE; ++i) { + tids[i] = -1; + } + + #pragma omp parallel shared(tids) + { + #pragma omp master + #pragma omp taskloop grainsize(grainsize) + for (i = 0; i < CFDMAX_SIZE; i++) { + tids[i] = omp_get_thread_num(); + } + } 
+ + for (i = 0; i < CFDMAX_SIZE; ++i) { + if (tids[i] == -1) { + fprintf(stderr, " Iteration %d not touched!\n", i); + result++; + } + } + + for (i = 0; i < CFDMAX_SIZE - 1; ++i) { + if (tids[i] != tids[i + 1]) { + count++; + } + } + + tmp = (int *)malloc(sizeof(int) * (count + 1)); + tmp[0] = 1; + + for (i = 0; i < CFDMAX_SIZE - 1; ++i) { + if (tmp_count > count) { + printf("--------------------\nTestinternal Error: List too " + "small!!!\n--------------------\n"); + break; + } + if (tids[i] != tids[i + 1]) { + tmp_count++; + tmp[tmp_count] = 1; + } else { + tmp[tmp_count]++; + } + } + + // is grainsize statement working? + int num_tasks = CFDMAX_SIZE / grainsize; + int multiple1 = CFDMAX_SIZE / num_tasks; + int multiple2 = CFDMAX_SIZE / num_tasks + 1; + for (i = 0; i < count; i++) { + // it is possible for 2 adjacent chunks assigned to a same thread + if (tmp[i] % multiple1 != 0 && tmp[i] % multiple2 != 0) { + num_off++; + } + } + + if (num_off > 1) { + fprintf(stderr, " The number of bad chunks is %d\n", num_off); + result++; + } else { + fprintf(stderr, " Everything ok\n"); + } + + free(tmp); + } + free(tidsArray); + return (result==0); +} + +int main() +{ + int i; + int num_failed=0; + + for (i = 0; i < REPETITIONS; i++) { + if (!test_omp_taskloop_grainsize()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/tasking/omp_taskloop_num_tasks.c b/final/runtime/test/tasking/omp_taskloop_num_tasks.c new file mode 100644 index 0000000..7c3c704 --- /dev/null +++ b/final/runtime/test/tasking/omp_taskloop_num_tasks.c @@ -0,0 +1,71 @@ +// RUN: %libomp-compile-and-run +// RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run + +// These compilers don't support the taskloop construct +// UNSUPPORTED: gcc-4, gcc-5, icc-16 + +/* + * Test for taskloop + * Method: caculate how many times the iteration space is dispatched + * and judge if each dispatch has the requested grainsize + * It is possible for two adjacent chunks are executed 
by the same thread + */ +#include <stdio.h> +#include <omp.h> +#include <stdlib.h> +#include "omp_testsuite.h" + +#define CFDMAX_SIZE 1120 + +int test_omp_taskloop_num_tasks() +{ + int i; + int *tids; + int *tidsArray; + int count; + int result = 0; + int num_tasks; + + for (num_tasks = 1; num_tasks < 120; ++num_tasks) { + count = 0; + tidsArray = (int *)malloc(sizeof(int) * CFDMAX_SIZE); + tids = tidsArray; + + #pragma omp parallel shared(tids) + { + int i; + #pragma omp master + #pragma omp taskloop num_tasks(num_tasks) + for (i = 0; i < CFDMAX_SIZE; i++) { + tids[i] = omp_get_thread_num(); + } + } + + for (i = 0; i < CFDMAX_SIZE - 1; ++i) { + if (tids[i] != tids[i + 1]) { + count++; + } + } + + if (count > num_tasks) { + fprintf(stderr, "counted too many tasks: (wanted %d, got %d)\n", + num_tasks, count); + result++; + } + } + + return (result==0); +} + +int main() +{ + int i; + int num_failed=0; + + for (i = 0; i < REPETITIONS; i++) { + if (!test_omp_taskloop_num_tasks()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/tasking/omp_taskwait.c b/final/runtime/test/tasking/omp_taskwait.c new file mode 100644 index 0000000..c3a0ea7 --- /dev/null +++ b/final/runtime/test/tasking/omp_taskwait.c @@ -0,0 +1,74 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <math.h> +#include "omp_testsuite.h" +#include "omp_my_sleep.h" + +int test_omp_taskwait() +{ + int result1 = 0; /* Stores number of not finished tasks after the taskwait */ + int result2 = 0; /* Stores number of wrong array elements at the end */ + int array[NUM_TASKS]; + int i; + + /* fill array */ + for (i = 0; i < NUM_TASKS; i++) + array[i] = 0; + + #pragma omp parallel + { + #pragma omp single + { + for (i = 0; i < NUM_TASKS; i++) { + /* First we have to store the value of the loop index in a new variable + * which will be private for each task because otherwise it will be overwritten + * if the execution of the task takes longer than the time which is needed 
to + * enter the next step of the loop! + */ + int myi; + myi = i; + #pragma omp task + { + my_sleep (SLEEPTIME); + array[myi] = 1; + } /* end of omp task */ + } /* end of for */ + #pragma omp taskwait + /* check if all tasks were finished */ + for (i = 0; i < NUM_TASKS; i++) + if (array[i] != 1) + result1++; + + /* generate some more tasks which now shall overwrite + * the values in the tids array */ + for (i = 0; i < NUM_TASKS; i++) { + int myi; + myi = i; + #pragma omp task + { + array[myi] = 2; + } /* end of omp task */ + } /* end of for */ + } /* end of single */ + } /*end of parallel */ + + /* final check, if all array elements contain the right values: */ + for (i = 0; i < NUM_TASKS; i++) { + if (array[i] != 2) + result2++; + } + return ((result1 == 0) && (result2 == 0)); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_taskwait()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/tasking/omp_taskyield.c b/final/runtime/test/tasking/omp_taskyield.c new file mode 100644 index 0000000..5bb6984 --- /dev/null +++ b/final/runtime/test/tasking/omp_taskyield.c @@ -0,0 +1,58 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <math.h> +#include "omp_testsuite.h" +#include "omp_my_sleep.h" + +int test_omp_taskyield() +{ + int i; + int count = 0; + int start_tid[NUM_TASKS]; + int current_tid[NUM_TASKS]; + + for (i=0; i< NUM_TASKS; i++) { + start_tid[i]=0; + current_tid[i]=0; + } + + #pragma omp parallel + { + #pragma omp single + { + for (i = 0; i < NUM_TASKS; i++) { + int myi = i; + #pragma omp task untied + { + my_sleep(SLEEPTIME); + start_tid[myi] = omp_get_thread_num(); + #pragma omp taskyield + if((start_tid[myi] %2) ==0){ + my_sleep(SLEEPTIME); + current_tid[myi] = omp_get_thread_num(); + } /*end of if*/ + } /* end of omp task */ + } /* end of for */ + } /* end of single */ + } /* end of parallel */ + for (i=0;i<NUM_TASKS; i++) { + //printf("start_tid[%d]=%d, 
current_tid[%d]=%d\n", + //i, start_tid[i], i , current_tid[i]); + if (current_tid[i] == start_tid[i]) + count++; + } + return (count<NUM_TASKS); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_taskyield()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/threadprivate/omp_threadprivate.c b/final/runtime/test/threadprivate/omp_threadprivate.c new file mode 100644 index 0000000..a3dd80d --- /dev/null +++ b/final/runtime/test/threadprivate/omp_threadprivate.c @@ -0,0 +1,102 @@ +// RUN: %libomp-compile-and-run +/* + * Threadprivate is tested in 2 ways: + * 1. The global variable declared as threadprivate should have + * local copy for each thread. Otherwise race condition and + * wrong result. + * 2. If the value of local copy is retained for the two adjacent + * parallel regions + */ +#include "omp_testsuite.h" +#include <stdlib.h> +#include <stdio.h> + +static int sum0=0; +static int myvalue = 0; + +#pragma omp threadprivate(sum0) +#pragma omp threadprivate(myvalue) + +int test_omp_threadprivate() +{ + int sum = 0; + int known_sum; + int i; + int iter; + int *data; + int size; + int num_failed = 0; + int my_random; + omp_set_dynamic(0); + + #pragma omp parallel private(i) + { + sum0 = 0; + #pragma omp for + for (i = 1; i <= LOOPCOUNT; i++) { + sum0 = sum0 + i; + } /*end of for*/ + #pragma omp critical + { + sum = sum + sum0; + } /*end of critical */ + } /* end of parallel */ + known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; + if (known_sum != sum ) { + fprintf (stderr, " known_sum = %d, sum = %d\n", known_sum, sum); + } + + /* the next parallel region is just used to get the number of threads*/ + omp_set_dynamic(0); + #pragma omp parallel + { + #pragma omp master + { + size=omp_get_num_threads(); + data=(int*) malloc(size*sizeof(int)); + } + }/* end parallel*/ + + srand(45); + for (iter = 0; iter < 100; iter++) { + my_random = rand(); /* random number generator is + called inside 
serial region*/ + + /* the first parallel region is used to initialiye myvalue + and the array with my_random+rank */ + #pragma omp parallel + { + int rank; + rank = omp_get_thread_num (); + myvalue = data[rank] = my_random + rank; + } + + /* the second parallel region verifies that the + value of "myvalue" is retained */ + #pragma omp parallel reduction(+:num_failed) + { + int rank; + rank = omp_get_thread_num (); + num_failed = num_failed + (myvalue != data[rank]); + if(myvalue != data[rank]) { + fprintf (stderr, " myvalue = %d, data[rank]= %d\n", + myvalue, data[rank]); + } + } + } + free (data); + return (known_sum == sum) && !num_failed; +} /* end of check_threadprivate*/ + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_threadprivate()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/threadprivate/omp_threadprivate_for.c b/final/runtime/test/threadprivate/omp_threadprivate_for.c new file mode 100644 index 0000000..3342e63 --- /dev/null +++ b/final/runtime/test/threadprivate/omp_threadprivate_for.c @@ -0,0 +1,48 @@ +// RUN: %libomp-compile-and-run +#include "omp_testsuite.h" +#include <stdlib.h> +#include <stdio.h> + +static int i; +#pragma omp threadprivate(i) + +int test_omp_threadprivate_for() +{ + int known_sum; + int sum; + + known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; + sum = 0; + + #pragma omp parallel + { + int sum0 = 0, i0; + #pragma omp for + for (i0 = 1; i0 <= LOOPCOUNT; i0++) { + i = i0; + sum0 = sum0 + i; + } + #pragma omp critical + { + sum = sum + sum0; + } + } /* end of parallel */ + + if (known_sum != sum ) { + fprintf(stderr, " known_sum = %d, sum = %d\n", known_sum, sum); + } + return (known_sum == sum); +} /* end of check_threadprivate*/ + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_threadprivate_for()) { + num_failed++; + } + } + return num_failed; +} diff --git 
a/final/runtime/test/worksharing/for/bug_set_schedule_0.c b/final/runtime/test/worksharing/for/bug_set_schedule_0.c new file mode 100644 index 0000000..889e239 --- /dev/null +++ b/final/runtime/test/worksharing/for/bug_set_schedule_0.c @@ -0,0 +1,40 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <omp.h> +#include "omp_testsuite.h" + +/* Test that the chunk size is set to default (1) when + chunk size <= 0 is specified */ +int a = 0; + +int test_set_schedule_0() +{ + int i; + a = 0; + omp_set_schedule(omp_sched_dynamic,0); + + #pragma omp parallel + { + #pragma omp for schedule(runtime) + for(i = 0; i < 10; i++) { + #pragma omp atomic + a++; + if(a > 10) + exit(1); + } + } + return a==10; +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_set_schedule_0()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/for/kmp_doacross_check.c b/final/runtime/test/worksharing/for/kmp_doacross_check.c new file mode 100644 index 0000000..59b61e3 --- /dev/null +++ b/final/runtime/test/worksharing/for/kmp_doacross_check.c @@ -0,0 +1,62 @@ +// RUN: %libomp-compile-and-run +// UNSUPPORTED: gcc +// This test is incompatible with gcc because of the explicit call to +// __kmpc_doacross_fini(). gcc relies on an implicit call to this function +// when the last iteration is executed inside the GOMP_loop_*_next() functions. +// Hence, in gcc, having the explicit call leads to __kmpc_doacross_fini() +// being called twice. 
+#include <stdio.h> + +#define N 1000 + +struct dim { + long long lo; // lower + long long up; // upper + long long st; // stride +}; +extern void __kmpc_doacross_init(void*, int, int, struct dim *); +extern void __kmpc_doacross_wait(void*, int, long long*); +extern void __kmpc_doacross_post(void*, int, long long*); +extern void __kmpc_doacross_fini(void*, int); +extern int __kmpc_global_thread_num(void*); + +int main() +{ + int i; + int iter[N]; + struct dim dims; + for( i = 0; i < N; ++i ) + iter[i] = 1; + dims.lo = 1; + dims.up = N-1; + dims.st = 1; + #pragma omp parallel num_threads(4) + { + int i, gtid; + long long vec; + gtid = __kmpc_global_thread_num(NULL); + __kmpc_doacross_init(NULL,gtid,1,&dims); // thread starts the loop + #pragma omp for nowait schedule(dynamic) + for( i = 1; i < N; ++i ) + { + // runtime call corresponding to #pragma omp ordered depend(sink:i-1) + vec=i-1; + __kmpc_doacross_wait(NULL,gtid,&vec); + // user's code + iter[i] = iter[i-1] + 1; + // runtime call corresponding to #pragma omp ordered depend(source) + vec=i; + __kmpc_doacross_post(NULL,gtid,&vec); + } + // thread finishes the loop (should be before the loop barrier) + __kmpc_doacross_fini(NULL,gtid); + } + if( iter[N-1] == N ) { + printf("passed\n"); + } else { + printf("failed %d != %d\n", iter[N-1], N); + return 1; + } + return 0; +} + diff --git a/final/runtime/test/worksharing/for/kmp_sch_simd_guided.c b/final/runtime/test/worksharing/for/kmp_sch_simd_guided.c new file mode 100644 index 0000000..5c6f94b --- /dev/null +++ b/final/runtime/test/worksharing/for/kmp_sch_simd_guided.c @@ -0,0 +1,410 @@ +// RUN: %libomp-compile-and-run +/* + Test for the 'schedule(simd:guided)' clause. + Compiler needs to generate a dynamic dispatching and pass the schedule + value 46 to the OpenMP RTL. Test uses numerous loop parameter combinations. 
+*/ +#include <stdio.h> +#include <omp.h> + +#if defined(WIN32) || defined(_WIN32) +#include <windows.h> +#define delay() Sleep(1); +#else +#include <unistd.h> +#define delay() usleep(10); +#endif + +// uncomment for debug diagnostics: +//#define DEBUG + +#define SIMD_LEN 4 + +// --------------------------------------------------------------------------- +// Various definitions copied from OpenMP RTL +enum sched { + kmp_sch_static_balanced_chunked = 45, + kmp_sch_guided_simd = 46, + kmp_sch_runtime_simd = 47, +}; +typedef unsigned u32; +typedef long long i64; +typedef unsigned long long u64; +typedef struct { + int reserved_1; + int flags; + int reserved_2; + int reserved_3; + char *psource; +} id; + +extern int __kmpc_global_thread_num(id*); +extern void __kmpc_barrier(id*, int gtid); +extern void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int); +extern void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64); +extern int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*); +extern int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*); +// End of definitions copied from OpenMP RTL. +// --------------------------------------------------------------------------- +static id loc = {0, 2, 0, 0, ";file;func;0;0;;"}; + +// --------------------------------------------------------------------------- +int run_loop_64(i64 loop_lb, i64 loop_ub, i64 loop_st, int loop_chunk) { + int err = 0; + static int volatile loop_sync = 0; + i64 lb; // Chunk lower bound + i64 ub; // Chunk upper bound + i64 st; // Chunk stride + int rc; + int tid = omp_get_thread_num(); + int gtid = tid; + int last; +#if DEBUG + printf("run_loop_<%d>(lb=%d, ub=%d, st=%d, ch=%d)\n", + (int)sizeof(i64), gtid, tid, + (int)loop_lb, (int)loop_ub, (int)loop_st, loop_chunk); +#endif + // Don't test degenerate cases that should have been discovered by codegen + if (loop_st == 0) + return 0; + if (loop_st > 0 ? 
loop_lb > loop_ub : loop_lb < loop_ub) + return 0; + + __kmpc_dispatch_init_8(&loc, gtid, kmp_sch_guided_simd, + loop_lb, loop_ub, loop_st, loop_chunk); + if (tid == 0) { + // Let the master thread handle the chunks alone + int chunk; // No of current chunk + i64 next_lb; // Lower bound of the next chunk + i64 last_ub; // Upper bound of the last processed chunk + u64 cur; // Number of interations in current chunk + u64 max; // Max allowed iterations for current chunk + int undersized = 0; + + chunk = 0; + next_lb = loop_lb; + max = (loop_ub - loop_lb) / loop_st + 1; + // The first chunk can consume all iterations + while (__kmpc_dispatch_next_8(&loc, gtid, &last, &lb, &ub, &st)) { + ++ chunk; +#if DEBUG + printf("chunk=%d, lb=%d, ub=%d\n", chunk, (int)lb, (int)ub); +#endif + // Check if previous chunk (it is not the final chunk) is undersized + if (undersized) { + printf("Error with chunk %d\n", chunk); + err++; + } + // Check lower and upper bounds + if (lb != next_lb) { + printf("Error with lb %d, %d, ch %d\n", (int)lb, (int)next_lb, chunk); + err++; + } + if (loop_st > 0) { + if (!(ub <= loop_ub)) { + printf("Error with ub %d, %d, ch %d\n", (int)ub, (int)loop_ub, chunk); + err++; + } + if (!(lb <= ub)) { + printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk); + err++; + } + } else { + if (!(ub >= loop_ub)) { + printf("Error with ub %d, %d, %d\n", (int)ub, (int)loop_ub, chunk); + err++; + } + if (!(lb >= ub)) { + printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk); + err++; + } + }; // if + // Stride should not change + if (!(st == loop_st)) { + printf("Error with st %d, %d, ch %d\n", (int)st, (int)loop_st, chunk); + err++; + } + cur = (ub - lb) / loop_st + 1; + // Guided scheduling uses FP computations, so current chunk may + // be a bit bigger (+1) than allowed maximum + if (!(cur <= max + 1)) { + printf("Error with iter %d, %d\n", cur, max); + err++; + } + // Update maximum for the next chunk + if (cur < max) + max = cur; + next_lb = 
ub + loop_st; + last_ub = ub; + undersized = (cur < loop_chunk); + }; // while + // Must have at least one chunk + if (!(chunk > 0)) { + printf("Error with chunk %d\n", chunk); + err++; + } + // Must have the right last iteration index + if (loop_st > 0) { + if (!(last_ub <= loop_ub)) { + printf("Error with last1 %d, %d, ch %d\n", + (int)last_ub, (int)loop_ub, chunk); + err++; + } + if (!(last_ub + loop_st > loop_ub)) { + printf("Error with last2 %d, %d, %d, ch %d\n", + (int)last_ub, (int)loop_st, (int)loop_ub, chunk); + err++; + } + } else { + if (!(last_ub >= loop_ub)) { + printf("Error with last1 %d, %d, ch %d\n", + (int)last_ub, (int)loop_ub, chunk); + err++; + } + if (!(last_ub + loop_st < loop_ub)) { + printf("Error with last2 %d, %d, %d, ch %d\n", + (int)last_ub, (int)loop_st, (int)loop_ub, chunk); + err++; + } + }; // if + // Let non-master threads go + loop_sync = 1; + } else { + int i; + // Workers wait for master thread to finish, then call __kmpc_dispatch_next + for (i = 0; i < 1000000; ++ i) { + if (loop_sync != 0) { + break; + }; // if + }; // for i + while (loop_sync == 0) { + delay(); + }; // while + // At this moment we do not have any more chunks -- all the chunks already + // processed by master thread + rc = __kmpc_dispatch_next_8(&loc, gtid, &last, &lb, &ub, &st); + if (rc) { + printf("Error return value\n"); + err++; + } + }; // if + + __kmpc_barrier(&loc, gtid); + if (tid == 0) { + loop_sync = 0; // Restore original state +#if DEBUG + printf("run_loop_64(): at the end\n"); +#endif + }; // if + __kmpc_barrier(&loc, gtid); + return err; +} // run_loop + +// --------------------------------------------------------------------------- +int run_loop_32(int loop_lb, int loop_ub, int loop_st, int loop_chunk) { + int err = 0; + static int volatile loop_sync = 0; + int lb; // Chunk lower bound + int ub; // Chunk upper bound + int st; // Chunk stride + int rc; + int tid = omp_get_thread_num(); + int gtid = tid; + int last; +#if DEBUG + 
printf("run_loop_<%d>(lb=%d, ub=%d, st=%d, ch=%d)\n", + (int)sizeof(int), gtid, tid, + (int)loop_lb, (int)loop_ub, (int)loop_st, loop_chunk); +#endif + // Don't test degenerate cases that should have been discovered by codegen + if (loop_st == 0) + return 0; + if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub) + return 0; + + __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_guided_simd, + loop_lb, loop_ub, loop_st, loop_chunk); + if (tid == 0) { + // Let the master thread handle the chunks alone + int chunk; // No of current chunk + int next_lb; // Lower bound of the next chunk + int last_ub; // Upper bound of the last processed chunk + u64 cur; // Number of interations in current chunk + u64 max; // Max allowed iterations for current chunk + int undersized = 0; + + chunk = 0; + next_lb = loop_lb; + max = (loop_ub - loop_lb) / loop_st + 1; + // The first chunk can consume all iterations + while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) { + ++ chunk; +#if DEBUG + printf("chunk=%d, lb=%d, ub=%d\n", chunk, (int)lb, (int)ub); +#endif + // Check if previous chunk (it is not the final chunk) is undersized + if (undersized) { + printf("Error with chunk %d\n", chunk); + err++; + } + // Check lower and upper bounds + if (lb != next_lb) { + printf("Error with lb %d, %d, ch %d\n", (int)lb, (int)next_lb, chunk); + err++; + } + if (loop_st > 0) { + if (!(ub <= loop_ub)) { + printf("Error with ub %d, %d, ch %d\n", (int)ub, (int)loop_ub, chunk); + err++; + } + if (!(lb <= ub)) { + printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk); + err++; + } + } else { + if (!(ub >= loop_ub)) { + printf("Error with ub %d, %d, %d\n", (int)ub, (int)loop_ub, chunk); + err++; + } + if (!(lb >= ub)) { + printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk); + err++; + } + }; // if + // Stride should not change + if (!(st == loop_st)) { + printf("Error with st %d, %d, ch %d\n", (int)st, (int)loop_st, chunk); + err++; + } + cur = (ub - lb) / loop_st + 1; + 
// Guided scheduling uses FP computations, so current chunk may + // be a bit bigger (+1) than allowed maximum + if (!(cur <= max + 1)) { + printf("Error with iter %d, %d\n", cur, max); + err++; + } + // Update maximum for the next chunk + if (cur < max) + max = cur; + next_lb = ub + loop_st; + last_ub = ub; + undersized = (cur < loop_chunk); + }; // while + // Must have at least one chunk + if (!(chunk > 0)) { + printf("Error with chunk %d\n", chunk); + err++; + } + // Must have the right last iteration index + if (loop_st > 0) { + if (!(last_ub <= loop_ub)) { + printf("Error with last1 %d, %d, ch %d\n", + (int)last_ub, (int)loop_ub, chunk); + err++; + } + if (!(last_ub + loop_st > loop_ub)) { + printf("Error with last2 %d, %d, %d, ch %d\n", + (int)last_ub, (int)loop_st, (int)loop_ub, chunk); + err++; + } + } else { + if (!(last_ub >= loop_ub)) { + printf("Error with last1 %d, %d, ch %d\n", + (int)last_ub, (int)loop_ub, chunk); + err++; + } + if (!(last_ub + loop_st < loop_ub)) { + printf("Error with last2 %d, %d, %d, ch %d\n", + (int)last_ub, (int)loop_st, (int)loop_ub, chunk); + err++; + } + }; // if + // Let non-master threads go + loop_sync = 1; + } else { + int i; + // Workers wait for master thread to finish, then call __kmpc_dispatch_next + for (i = 0; i < 1000000; ++ i) { + if (loop_sync != 0) { + break; + }; // if + }; // for i + while (loop_sync == 0) { + delay(); + }; // while + // At this moment we do not have any more chunks -- all the chunks already + // processed by the master thread + rc = __kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st); + if (rc) { + printf("Error return value\n"); + err++; + } + }; // if + + __kmpc_barrier(&loc, gtid); + if (tid == 0) { + loop_sync = 0; // Restore original state +#if DEBUG + printf("run_loop<>(): at the end\n"); +#endif + }; // if + __kmpc_barrier(&loc, gtid); + return err; +} // run_loop + +// --------------------------------------------------------------------------- +int run_64(int num_th) +{ + int 
err = 0; +#pragma omp parallel num_threads(num_th) + { + int chunk; + i64 st, lb, ub; + for (chunk = SIMD_LEN; chunk <= 3*SIMD_LEN; chunk += SIMD_LEN) { + for (st = 1; st <= 3; ++ st) { + for (lb = -3 * num_th * st; lb <= 3 * num_th * st; ++ lb) { + for (ub = lb; ub < lb + num_th * (chunk+1) * st; ++ ub) { + err += run_loop_64(lb, ub, st, chunk); + err += run_loop_64(ub, lb, -st, chunk); + }; // for ub + }; // for lb + }; // for st + }; // for chunk + } + return err; +} // run_all + +int run_32(int num_th) +{ + int err = 0; +#pragma omp parallel num_threads(num_th) + { + int chunk, st, lb, ub; + for (chunk = SIMD_LEN; chunk <= 3*SIMD_LEN; chunk += SIMD_LEN) { + for (st = 1; st <= 3; ++ st) { + for (lb = -3 * num_th * st; lb <= 3 * num_th * st; ++ lb) { + for (ub = lb; ub < lb + num_th * (chunk+1) * st; ++ ub) { + err += run_loop_32(lb, ub, st, chunk); + err += run_loop_32(ub, lb, -st, chunk); + }; // for ub + }; // for lb + }; // for st + }; // for chunk + } + return err; +} // run_all + +// --------------------------------------------------------------------------- +int main() +{ + int n, err = 0; + for (n = 1; n <= 4; ++ n) { + err += run_32(n); + err += run_64(n); + }; // for n + if (err) + printf("failed with %d errors\n", err); + else + printf("passed\n"); + return err; +} diff --git a/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_api.c b/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_api.c new file mode 100644 index 0000000..bb538d1 --- /dev/null +++ b/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_api.c @@ -0,0 +1,221 @@ +// RUN: %libomp-compile-and-run + +// The test checks schedule(simd:runtime) +// in combination with omp_set_schedule() +#include <stdio.h> +#include <stdlib.h> +#include <omp.h> + +#if defined(WIN32) || defined(_WIN32) +#include <windows.h> +#define delay() Sleep(1); +#define seten(a,b,c) _putenv_s((a),(b)) +#else +#include <unistd.h> +#define delay() usleep(10); +#define seten(a,b,c) setenv((a),(b),(c)) +#endif 
+ +#define SIMD_LEN 4 +int err = 0; + +// --------------------------------------------------------------------------- +// Various definitions copied from OpenMP RTL. +enum sched { + kmp_sch_static_balanced_chunked = 45, + kmp_sch_guided_simd = 46, + kmp_sch_runtime_simd = 47, +}; +typedef unsigned u32; +typedef long long i64; +typedef unsigned long long u64; +typedef struct { + int reserved_1; + int flags; + int reserved_2; + int reserved_3; + char *psource; +} id; + +#ifdef __cplusplus +extern "C" { +#endif + int __kmpc_global_thread_num(id*); + void __kmpc_barrier(id*, int gtid); + void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int); + void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64); + int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*); + int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*); +#ifdef __cplusplus +} // extern "C" +#endif +// End of definitions copied from OpenMP RTL. +// --------------------------------------------------------------------------- +static id loc = {0, 2, 0, 0, ";file;func;0;0;;"}; + +// --------------------------------------------------------------------------- +void +run_loop( + int loop_lb, // Loop lower bound. + int loop_ub, // Loop upper bound. + int loop_st, // Loop stride. + int lchunk +) { + static int volatile loop_sync = 0; + int lb; // Chunk lower bound. + int ub; // Chunk upper bound. + int st; // Chunk stride. + int rc; + int tid = omp_get_thread_num(); + int gtid = __kmpc_global_thread_num(&loc); + int last; + int tc = (loop_ub - loop_lb) / loop_st + 1; + int ch; + int no_chunk = 0; + if (lchunk == 0) { + no_chunk = 1; + lchunk = 1; + } + ch = lchunk * SIMD_LEN; +#if _DEBUG > 1 + printf("run_loop gtid %d tid %d (lb=%d, ub=%d, st=%d, ch=%d)\n", + gtid, tid, (int)loop_lb, (int)loop_ub, (int)loop_st, lchunk); +#endif + // Don't test degenerate cases that should have been discovered by codegen. + if (loop_st == 0) + return; + if (loop_st > 0 ? 
loop_lb > loop_ub : loop_lb < loop_ub) + return; + __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_runtime_simd, + loop_lb, loop_ub, loop_st, SIMD_LEN); + { + // Let the master thread handle the chunks alone. + int chunk; // No of current chunk. + int last_ub; // Upper bound of the last processed chunk. + u64 cur; // Number of interations in current chunk. + u64 max; // Max allowed iterations for current chunk. + int undersized = 0; + last_ub = loop_ub; + chunk = 0; + max = (loop_ub - loop_lb) / loop_st + 1; + // The first chunk can consume all iterations. + while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) { + ++ chunk; +#if _DEBUG + printf("th %d: chunk=%d, lb=%d, ub=%d ch %d\n", + tid, chunk, (int)lb, (int)ub, (int)(ub-lb+1)); +#endif + // Check if previous chunk (it is not the final chunk) is undersized. + if (undersized) + printf("Error with chunk %d, th %d, err %d\n", chunk, tid, ++err); + if (loop_st > 0) { + if (!(ub <= loop_ub)) + printf("Error with ub %d, %d, ch %d, err %d\n", + (int)ub, (int)loop_ub, chunk, ++err); + if (!(lb <= ub)) + printf("Error with bounds %d, %d, %d, err %d\n", + (int)lb, (int)ub, chunk, ++err); + } else { + if (!(ub >= loop_ub)) + printf("Error with ub %d, %d, %d, err %d\n", + (int)ub, (int)loop_ub, chunk, ++err); + if (!(lb >= ub)) + printf("Error with bounds %d, %d, %d, err %d\n", + (int)lb, (int)ub, chunk, ++err); + }; // if + // Stride should not change. + if (!(st == loop_st)) + printf("Error with st %d, %d, ch %d, err %d\n", + (int)st, (int)loop_st, chunk, ++err); + cur = ( ub - lb ) / loop_st + 1; + // Guided scheduling uses FP computations, so current chunk may + // be a bit bigger (+1) than allowed maximum. + if (!( cur <= max + 1)) + printf("Error with iter %d, %d, err %d\n", cur, max, ++err); + // Update maximum for the next chunk. 
+ if (last) { + if (!no_chunk && cur > ch) + printf("Error: too big last chunk %d (%d), tid %d, err %d\n", + (int)cur, ch, tid, ++err); + } else { + if (cur % ch) + printf("Error with chunk %d, %d, ch %d, tid %d, err %d\n", + chunk, (int)cur, ch, tid, ++err); + } + if (cur < max) + max = cur; + last_ub = ub; + undersized = (cur < ch); +#if _DEBUG > 1 + if (last) + printf("under%d cur %d, ch %d, tid %d, ub %d, lb %d, st %d =======\n", + undersized,cur,ch,tid,ub,lb,loop_st); +#endif + } // while + // Must have the right last iteration index. + if (loop_st > 0) { + if (!(last_ub <= loop_ub)) + printf("Error with last1 %d, %d, ch %d, err %d\n", + (int)last_ub, (int)loop_ub, chunk, ++err); + if (last && !(last_ub + loop_st > loop_ub)) + printf("Error with last2 %d, %d, %d, ch %d, err %d\n", + (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err); + } else { + if (!(last_ub >= loop_ub)) + printf("Error with last1 %d, %d, ch %d, err %d\n", + (int)last_ub, (int)loop_ub, chunk, ++err); + if (last && !(last_ub + loop_st < loop_ub)) + printf("Error with last2 %d, %d, %d, ch %d, err %d\n", + (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err); + } // if + } + __kmpc_barrier(&loc, gtid); +} // run_loop + +int main(int argc, char *argv[]) +{ + int chunk = 0; +// static (no chunk) + omp_set_schedule(omp_sched_static,0); +#pragma omp parallel// num_threads(num_th) + run_loop(0, 26, 1, chunk); + +// auto (chunk should be ignorted) + omp_set_schedule(omp_sched_auto,0); +#pragma omp parallel// num_threads(num_th) + run_loop(0, 26, 1, chunk); + +// static,1 + chunk = 1; + omp_set_schedule(omp_sched_static,1); +#pragma omp parallel// num_threads(num_th) + run_loop(0, 26, 1, chunk); + +// dynamic,1 + omp_set_schedule(omp_sched_dynamic,1); +#pragma omp parallel// num_threads(num_th) + run_loop(0, 26, 1, chunk); + +// guided,1 + omp_set_schedule(omp_sched_guided,1); +#pragma omp parallel// num_threads(num_th) + run_loop(0, 26, 1, chunk); + +// dynamic,0 - use default chunk size 1 + 
omp_set_schedule(omp_sched_dynamic,0); +#pragma omp parallel// num_threads(num_th) + run_loop(0, 26, 1, chunk); + +// guided,0 - use default chunk size 1 + omp_set_schedule(omp_sched_guided,0); +#pragma omp parallel// num_threads(num_th) + run_loop(0, 26, 1, chunk); + + if (err) { + printf("failed, err = %d\n", err); + return 1; + } else { + printf("passed\n"); + return 0; + } +} diff --git a/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_guided.c b/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_guided.c new file mode 100644 index 0000000..d137831 --- /dev/null +++ b/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_guided.c @@ -0,0 +1,196 @@ +// RUN: %libomp-compile +// RUN: env OMP_SCHEDULE=guided %libomp-run +// RUN: env OMP_SCHEDULE=guided,1 %libomp-run 1 +// RUN: env OMP_SCHEDULE=guided,2 %libomp-run 2 +// RUN: env OMP_SCHEDULE=dynamic %libomp-run +// RUN: env OMP_SCHEDULE=dynamic,1 %libomp-run 1 +// RUN: env OMP_SCHEDULE=dynamic,2 %libomp-run 2 +// RUN: env OMP_SCHEDULE=auto %libomp-run + +// The test checks schedule(simd:runtime) +// in combination with OMP_SCHEDULE=guided[,chunk] +#include <stdio.h> +#include <stdlib.h> +#include <omp.h> + +#if defined(WIN32) || defined(_WIN32) +#include <windows.h> +#define delay() Sleep(1); +#define seten(a,b,c) _putenv_s((a),(b)) +#else +#include <unistd.h> +#define delay() usleep(10); +#define seten(a,b,c) setenv((a),(b),(c)) +#endif + +#define UBOUND 100 +#define SIMD_LEN 4 +int err = 0; + +// --------------------------------------------------------------------------- +// Various definitions copied from OpenMP RTL. 
+enum sched { + kmp_sch_static_balanced_chunked = 45, + kmp_sch_guided_simd = 46, + kmp_sch_runtime_simd = 47, +}; +typedef unsigned u32; +typedef long long i64; +typedef unsigned long long u64; +typedef struct { + int reserved_1; + int flags; + int reserved_2; + int reserved_3; + char *psource; +} id; + +#ifdef __cplusplus +extern "C" { +#endif + int __kmpc_global_thread_num(id*); + void __kmpc_barrier(id*, int gtid); + void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int); + void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64); + int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*); + int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*); +#ifdef __cplusplus +} // extern "C" +#endif +// End of definitions copied from OpenMP RTL. +// --------------------------------------------------------------------------- +static id loc = {0, 2, 0, 0, ";file;func;0;0;;"}; + +// --------------------------------------------------------------------------- +void +run_loop( + int loop_lb, // Loop lower bound. + int loop_ub, // Loop upper bound. + int loop_st, // Loop stride. + int lchunk +) { + static int volatile loop_sync = 0; + int lb; // Chunk lower bound. + int ub; // Chunk upper bound. + int st; // Chunk stride. + int rc; + int tid = omp_get_thread_num(); + int gtid = __kmpc_global_thread_num(&loc); + int last; + int tc = (loop_ub - loop_lb) / loop_st + 1; + int ch; + int no_chunk = 0; + if (lchunk == 0) { + no_chunk = 1; + lchunk = 1; + } + ch = lchunk * SIMD_LEN; +#if _DEBUG > 1 + printf("run_loop gtid %d tid %d (lb=%d, ub=%d, st=%d, ch=%d)\n", + gtid, tid, (int)loop_lb, (int)loop_ub, (int)loop_st, lchunk); +#endif + // Don't test degenerate cases that should have been discovered by codegen. + if (loop_st == 0) + return; + if (loop_st > 0 ? 
loop_lb > loop_ub : loop_lb < loop_ub) + return; + __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_runtime_simd, + loop_lb, loop_ub, loop_st, SIMD_LEN); + { + // Let the master thread handle the chunks alone. + int chunk; // No of current chunk. + int last_ub; // Upper bound of the last processed chunk. + u64 cur; // Number of interations in current chunk. + u64 max; // Max allowed iterations for current chunk. + int undersized = 0; + last_ub = loop_ub; + chunk = 0; + max = (loop_ub - loop_lb) / loop_st + 1; + // The first chunk can consume all iterations. + while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) { + ++ chunk; +#if _DEBUG + printf("th %d: chunk=%d, lb=%d, ub=%d ch %d\n", + tid, chunk, (int)lb, (int)ub, (int)(ub-lb+1)); +#endif + // Check if previous chunk (it is not the final chunk) is undersized. + if (undersized) + printf("Error with chunk %d, th %d, err %d\n", chunk, tid, ++err); + if (loop_st > 0) { + if (!(ub <= loop_ub)) + printf("Error with ub %d, %d, ch %d, err %d\n", + (int)ub, (int)loop_ub, chunk, ++err); + if (!(lb <= ub)) + printf("Error with bounds %d, %d, %d, err %d\n", + (int)lb, (int)ub, chunk, ++err); + } else { + if (!(ub >= loop_ub)) + printf("Error with ub %d, %d, %d, err %d\n", + (int)ub, (int)loop_ub, chunk, ++err); + if (!(lb >= ub)) + printf("Error with bounds %d, %d, %d, err %d\n", + (int)lb, (int)ub, chunk, ++err); + }; // if + // Stride should not change. + if (!(st == loop_st)) + printf("Error with st %d, %d, ch %d, err %d\n", + (int)st, (int)loop_st, chunk, ++err); + cur = ( ub - lb ) / loop_st + 1; + // Guided scheduling uses FP computations, so current chunk may + // be a bit bigger (+1) than allowed maximum. + if (!( cur <= max + 1)) + printf("Error with iter %d, %d, err %d\n", cur, max, ++err); + // Update maximum for the next chunk. 
+ if (!last && cur % ch) + printf("Error with chunk %d, %d, ch %d, tid %d, err %d\n", + chunk, (int)cur, ch, tid, ++err); + if (last && !no_chunk && cur > ch) + printf("Error: too big last chunk %d (%d), tid %d, err %d\n", + (int)cur, ch, tid, ++err); + if (cur < max) + max = cur; + last_ub = ub; + undersized = (cur < ch); +#if _DEBUG > 1 + if (last) + printf("under%d cur %d, ch %d, tid %d, ub %d, lb %d, st %d =======\n", + undersized,cur,ch,tid,ub,lb,loop_st); +#endif + } // while + // Must have the right last iteration index. + if (loop_st > 0) { + if (!(last_ub <= loop_ub)) + printf("Error with last1 %d, %d, ch %d, err %d\n", + (int)last_ub, (int)loop_ub, chunk, ++err); + if (last && !(last_ub + loop_st > loop_ub)) + printf("Error with last2 %d, %d, %d, ch %d, err %d\n", + (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err); + } else { + if (!(last_ub >= loop_ub)) + printf("Error with last1 %d, %d, ch %d, err %d\n", + (int)last_ub, (int)loop_ub, chunk, ++err); + if (last && !(last_ub + loop_st < loop_ub)) + printf("Error with last2 %d, %d, %d, ch %d, err %d\n", + (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err); + } // if + } + __kmpc_barrier(&loc, gtid); +} // run_loop + +int main(int argc, char *argv[]) +{ + int chunk = 0; + if (argc > 1) { + // expect chunk size as a parameter + chunk = atoi(argv[1]); + } +#pragma omp parallel //num_threads(num_th) + run_loop(0, UBOUND, 1, chunk); + if (err) { + printf("failed, err = %d\n", err); + return 1; + } else { + printf("passed\n"); + return 0; + } +} diff --git a/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_static.c b/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_static.c new file mode 100644 index 0000000..4cb15d6 --- /dev/null +++ b/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_static.c @@ -0,0 +1,201 @@ +// RUN: %libomp-compile && %libomp-run +// RUN: %libomp-run 1 && %libomp-run 2 + +// The test checks schedule(simd:runtime) +// in combination with 
OMP_SCHEDULE=static[,chunk] +#include <stdio.h> +#include <stdlib.h> +#include <omp.h> + +#if defined(WIN32) || defined(_WIN32) +#include <windows.h> +#define delay() Sleep(1); +#define seten(a,b,c) _putenv_s((a),(b)) +#else +#include <unistd.h> +#define delay() usleep(10); +#define seten(a,b,c) setenv((a),(b),(c)) +#endif + +#define SIMD_LEN 4 +int err = 0; + +// --------------------------------------------------------------------------- +// Various definitions copied from OpenMP RTL. +enum sched { + kmp_sch_static_balanced_chunked = 45, + kmp_sch_guided_simd = 46, + kmp_sch_runtime_simd = 47, +}; +typedef unsigned u32; +typedef long long i64; +typedef unsigned long long u64; +typedef struct { + int reserved_1; + int flags; + int reserved_2; + int reserved_3; + char *psource; +} id; + +#ifdef __cplusplus +extern "C" { +#endif + int __kmpc_global_thread_num(id*); + void __kmpc_barrier(id*, int gtid); + void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int); + void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64); + int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*); + int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*); +#ifdef __cplusplus +} // extern "C" +#endif +// End of definitions copied from OpenMP RTL. +// --------------------------------------------------------------------------- +static id loc = {0, 2, 0, 0, ";file;func;0;0;;"}; + +// --------------------------------------------------------------------------- +void +run_loop( + int loop_lb, // Loop lower bound. + int loop_ub, // Loop upper bound. + int loop_st, // Loop stride. + int lchunk +) { + static int volatile loop_sync = 0; + int lb; // Chunk lower bound. + int ub; // Chunk upper bound. + int st; // Chunk stride. 
+ int rc; + int tid = omp_get_thread_num(); + int gtid = __kmpc_global_thread_num(&loc); + int last; + int tc = (loop_ub - loop_lb) / loop_st + 1; + int ch; + int no_chunk = 0; + if (lchunk == 0) { + no_chunk = 1; + lchunk = 1; + } + ch = lchunk * SIMD_LEN; +#if _DEBUG > 1 + printf("run_loop gtid %d tid %d (lb=%d, ub=%d, st=%d, ch=%d)\n", + gtid, tid, (int)loop_lb, (int)loop_ub, (int)loop_st, lchunk); +#endif + // Don't test degenerate cases that should have been discovered by codegen. + if (loop_st == 0) + return; + if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub) + return; + __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_runtime_simd, + loop_lb, loop_ub, loop_st, SIMD_LEN); + { + // Let the master thread handle the chunks alone. + int chunk; // No of current chunk. + int last_ub; // Upper bound of the last processed chunk. + u64 cur; // Number of interations in current chunk. + u64 max; // Max allowed iterations for current chunk. + int undersized = 0; + last_ub = loop_ub; + chunk = 0; + max = (loop_ub - loop_lb) / loop_st + 1; + // The first chunk can consume all iterations. + while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) { + ++ chunk; +#if _DEBUG + printf("th %d: chunk=%d, lb=%d, ub=%d ch %d\n", + tid, chunk, (int)lb, (int)ub, (int)(ub-lb+1)); +#endif + // Check if previous chunk (it is not the final chunk) is undersized. + if (undersized) + printf("Error with chunk %d, th %d, err %d\n", chunk, tid, ++err); + if (loop_st > 0) { + if (!(ub <= loop_ub)) + printf("Error with ub %d, %d, ch %d, err %d\n", + (int)ub, (int)loop_ub, chunk, ++err); + if (!(lb <= ub)) + printf("Error with bounds %d, %d, %d, err %d\n", + (int)lb, (int)ub, chunk, ++err); + } else { + if (!(ub >= loop_ub)) + printf("Error with ub %d, %d, %d, err %d\n", + (int)ub, (int)loop_ub, chunk, ++err); + if (!(lb >= ub)) + printf("Error with bounds %d, %d, %d, err %d\n", + (int)lb, (int)ub, chunk, ++err); + }; // if + // Stride should not change. 
+ if (!(st == loop_st)) + printf("Error with st %d, %d, ch %d, err %d\n", + (int)st, (int)loop_st, chunk, ++err); + cur = ( ub - lb ) / loop_st + 1; + // Guided scheduling uses FP computations, so current chunk may + // be a bit bigger (+1) than allowed maximum. + if (!( cur <= max + 1)) + printf("Error with iter %d, %d, err %d\n", cur, max, ++err); + // Update maximum for the next chunk. + if (last) { + if (!no_chunk && cur > ch) + printf("Error: too big last chunk %d (%d), tid %d, err %d\n", + (int)cur, ch, tid, ++err); + } else { + if (cur % ch) + printf("Error with chunk %d, %d, ch %d, tid %d, err %d\n", + chunk, (int)cur, ch, tid, ++err); + } + if (cur < max) + max = cur; + last_ub = ub; + undersized = (cur < ch); +#if _DEBUG > 1 + if (last) + printf("under%d cur %d, ch %d, tid %d, ub %d, lb %d, st %d =======\n", + undersized,cur,ch,tid,ub,lb,loop_st); +#endif + } // while + // Must have the right last iteration index. + if (loop_st > 0) { + if (!(last_ub <= loop_ub)) + printf("Error with last1 %d, %d, ch %d, err %d\n", + (int)last_ub, (int)loop_ub, chunk, ++err); + if (last && !(last_ub + loop_st > loop_ub)) + printf("Error with last2 %d, %d, %d, ch %d, err %d\n", + (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err); + } else { + if (!(last_ub >= loop_ub)) + printf("Error with last1 %d, %d, ch %d, err %d\n", + (int)last_ub, (int)loop_ub, chunk, ++err); + if (last && !(last_ub + loop_st < loop_ub)) + printf("Error with last2 %d, %d, %d, ch %d, err %d\n", + (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err); + } // if + } + __kmpc_barrier(&loc, gtid); +} // run_loop + +int main(int argc, char *argv[]) +{ + int chunk = 0; + if (argc > 1) { + char *buf = malloc(8 + strlen(argv[1])); + // expect chunk size as a parameter + chunk = atoi(argv[1]); + strcpy(buf,"static,"); + strcat(buf,argv[1]); + seten("OMP_SCHEDULE",buf,1); + printf("Testing schedule(simd:%s)\n", buf); + free(buf); + } else { + seten("OMP_SCHEDULE","static",1); + printf("Testing 
schedule(simd:static)\n"); + } +#pragma omp parallel// num_threads(num_th) + run_loop(0, 26, 1, chunk); + if (err) { + printf("failed, err = %d\n", err); + return 1; + } else { + printf("passed\n"); + return 0; + } +} diff --git a/final/runtime/test/worksharing/for/kmp_set_dispatch_buf.c b/final/runtime/test/worksharing/for/kmp_set_dispatch_buf.c new file mode 100644 index 0000000..a6378fe --- /dev/null +++ b/final/runtime/test/worksharing/for/kmp_set_dispatch_buf.c @@ -0,0 +1,91 @@ +// RUN: %libomp-compile && %libomp-run 7 +// RUN: %libomp-run 0 && %libomp-run -1 +// RUN: %libomp-run 1 && %libomp-run 2 && %libomp-run 5 +// RUN: %libomp-compile -DMY_SCHEDULE=guided && %libomp-run 7 +// RUN: %libomp-run 1 && %libomp-run 2 && %libomp-run 5 +#include <stdio.h> +#include <omp.h> +#include <stdlib.h> +#include <limits.h> +#include "omp_testsuite.h" + +#define INCR 7 +#define MY_MAX 200 +#define MY_MIN -200 +#ifndef MY_SCHEDULE +# define MY_SCHEDULE dynamic +#endif + +int num_disp_buffers, num_loops; +int a, b, a_known_value, b_known_value; + +int test_kmp_set_disp_num_buffers() +{ + int success = 1; + a = 0; + b = 0; + // run many small dynamic loops to stress the dispatch buffer system + #pragma omp parallel + { + int i,j; + for (j = 0; j < num_loops; j++) { + #pragma omp for schedule(MY_SCHEDULE) nowait + for (i = MY_MIN; i < MY_MAX; i+=INCR) { + #pragma omp atomic + a++; + } + #pragma omp for schedule(MY_SCHEDULE) nowait + for (i = MY_MAX; i >= MY_MIN; i-=INCR) { + #pragma omp atomic + b++; + } + } + } + // detect failure + if (a != a_known_value || b != b_known_value) { + success = 0; + printf("a = %d (should be %d), b = %d (should be %d)\n", a, a_known_value, + b, b_known_value); + } + return success; +} + +int main(int argc, char** argv) +{ + int i,j; + int num_failed=0; + + if (argc != 2) { + fprintf(stderr, "usage: %s num_disp_buffers\n", argv[0]); + exit(1); + } + + // set the number of dispatch buffers + num_disp_buffers = atoi(argv[1]); + 
kmp_set_disp_num_buffers(num_disp_buffers); + + // figure out the known values to compare with calculated result + a_known_value = 0; + b_known_value = 0; + + // if specified to use bad num_disp_buffers set num_loops + // to something reasonable + if (num_disp_buffers <= 0) + num_loops = 10; + else + num_loops = num_disp_buffers*10; + + for (j = 0; j < num_loops; j++) { + for (i = MY_MIN; i < MY_MAX; i+=INCR) + a_known_value++; + for (i = MY_MAX; i >= MY_MIN; i-=INCR) + b_known_value++; + } + + for(i = 0; i < REPETITIONS; i++) { + if(!test_kmp_set_disp_num_buffers()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/for/omp_doacross.c b/final/runtime/test/worksharing/for/omp_doacross.c new file mode 100644 index 0000000..4187112 --- /dev/null +++ b/final/runtime/test/worksharing/for/omp_doacross.c @@ -0,0 +1,60 @@ +// RUN: %libomp-compile-and-run +// XFAIL: gcc-4, gcc-5, clang-3.7, clang-3.8, icc-15, icc-16 +#include <stdio.h> +#include <stdlib.h> +#include "omp_testsuite.h" + +#ifndef N +#define N 750 +#endif + +int test_doacross() { + int i, j; + // Allocate and zero out the matrix + int *m = (int *)malloc(sizeof(int) * N * N); + for (i = 0; i < N; ++i) { + for (j = 0; j < N; ++j) { + m[i * N + j] = 0; + } + } + // Have first row and column be 0, 1, 2, 3, etc. 
+ for (i = 0; i < N; ++i) + m[i * N] = i; + for (j = 0; j < N; ++j) + m[j] = j; + // Perform wavefront which results in matrix: + // 0 1 2 3 4 + // 1 2 3 4 5 + // 2 3 4 5 6 + // 3 4 5 6 7 + // 4 5 6 7 8 + #pragma omp parallel shared(m) + { + int row, col; + #pragma omp for ordered(2) + for (row = 1; row < N; ++row) { + for (col = 1; col < N; ++col) { + #pragma omp ordered depend(sink : row - 1, col) depend(sink : row, col - 1) + m[row * N + col] = m[(row - 1) * N + col] + m[row * N + (col - 1)] - + m[(row - 1) * N + (col - 1)]; + #pragma omp ordered depend(source) + } + } + } + + // Check the bottom right element to see if iteration dependencies were held + int retval = (m[(N - 1) * N + N - 1] == 2 * (N - 1)); + free(m); + return retval; +} + +int main(int argc, char **argv) { + int i; + int num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_doacross()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/for/omp_for_bigbounds.c b/final/runtime/test/worksharing/for/omp_for_bigbounds.c new file mode 100644 index 0000000..901d760 --- /dev/null +++ b/final/runtime/test/worksharing/for/omp_for_bigbounds.c @@ -0,0 +1,70 @@ +// RUN: %libomp-compile -DMY_SCHEDULE=static && %libomp-run +// RUN: %libomp-compile -DMY_SCHEDULE=dynamic && %libomp-run +// RUN: %libomp-compile -DMY_SCHEDULE=guided && %libomp-run + +// Only works with Intel Compiler since at least version 15.0 +// XFAIL: gcc, clang + +/* + * Test that large bounds are handled properly and calculations of + * loop iterations don't accidently overflow + */ +#include <stdio.h> +#include <omp.h> +#include <stdlib.h> +#include <limits.h> +#include "omp_testsuite.h" + +#define INCR 50000000 +#define MY_MAX 2000000000 +#define MY_MIN -2000000000 +#ifndef MY_SCHEDULE +# define MY_SCHEDULE static +#endif + +int a, b, a_known_value, b_known_value; + +int test_omp_for_bigbounds() +{ + a = 0; + b = 0; + #pragma omp parallel + { + int i; + #pragma omp for 
schedule(MY_SCHEDULE) + for (i = INT_MIN; i < MY_MAX; i+=INCR) { + #pragma omp atomic + a++; + } + #pragma omp for schedule(MY_SCHEDULE) + for (i = INT_MAX; i >= MY_MIN; i-=INCR) { + #pragma omp atomic + b++; + } + } + printf("a = %d (should be %d), b = %d (should be %d)\n", a, a_known_value, b, b_known_value); + return (a == a_known_value && b == b_known_value); +} + +int main() +{ + int i; + int num_failed=0; + + a_known_value = 0; + for (i = INT_MIN; i < MY_MAX; i+=INCR) { + a_known_value++; + } + + b_known_value = 0; + for (i = INT_MAX; i >= MY_MIN; i-=INCR) { + b_known_value++; + } + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_for_bigbounds()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/for/omp_for_collapse.c b/final/runtime/test/worksharing/for/omp_for_collapse.c new file mode 100644 index 0000000..a08086d --- /dev/null +++ b/final/runtime/test/worksharing/for/omp_for_collapse.c @@ -0,0 +1,51 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <math.h> +#include "omp_testsuite.h" + +/* Utility function to check that i is increasing monotonically + with each call */ +static int check_i_islarger (int i) +{ + static int last_i; + int islarger; + if (i==1) + last_i=0; + islarger = ((i >= last_i)&&(i - last_i<=1)); + last_i = i; + return (islarger); +} + +int test_omp_for_collapse() +{ + int is_larger = 1; + + #pragma omp parallel + { + int i,j; + int my_islarger = 1; + #pragma omp for private(i,j) schedule(static,1) collapse(2) ordered + for (i = 1; i < 100; i++) { + for (j =1; j <100; j++) { + #pragma omp ordered + my_islarger = check_i_islarger(i)&&my_islarger; + } + } + #pragma omp critical + is_larger = is_larger && my_islarger; + } + return (is_larger); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_for_collapse()) { + num_failed++; + } + } + return num_failed; +} diff --git 
a/final/runtime/test/worksharing/for/omp_for_firstprivate.c b/final/runtime/test/worksharing/for/omp_for_firstprivate.c new file mode 100644 index 0000000..6c4121c --- /dev/null +++ b/final/runtime/test/worksharing/for/omp_for_firstprivate.c @@ -0,0 +1,55 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <math.h> +#include "omp_testsuite.h" + +int sum1; +#pragma omp threadprivate(sum1) + +int test_omp_for_firstprivate() +{ + int sum; + int sum0; + int known_sum; + int threadsnum; + + sum = 0; + sum0 = 12345; + sum1 = 0; + + #pragma omp parallel + { + #pragma omp single + { + threadsnum=omp_get_num_threads(); + } + /* sum0 = 0; */ + + int i; + #pragma omp for firstprivate(sum0) + for (i = 1; i <= LOOPCOUNT; i++) { + sum0 = sum0 + i; + sum1 = sum0; + } /* end of for */ + + #pragma omp critical + { + sum = sum + sum1; + } /* end of critical */ + } /* end of parallel */ + known_sum = 12345* threadsnum+ (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; + return (known_sum == sum); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_for_firstprivate()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/for/omp_for_lastprivate.c b/final/runtime/test/worksharing/for/omp_for_lastprivate.c new file mode 100644 index 0000000..88694b8 --- /dev/null +++ b/final/runtime/test/worksharing/for/omp_for_lastprivate.c @@ -0,0 +1,52 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <math.h> +#include "omp_testsuite.h" + +int sum0; +#pragma omp threadprivate(sum0) + +int test_omp_for_lastprivate() +{ + int sum = 0; + int known_sum; + int i0; + + i0 = -1; + + #pragma omp parallel + { + sum0 = 0; + { /* Begin of orphaned block */ + int i; + #pragma omp for schedule(static,7) lastprivate(i0) + for (i = 1; i <= LOOPCOUNT; i++) { + sum0 = sum0 + i; + i0 = i; + } /* end of for */ + } /* end of orphaned block */ + + #pragma omp critical + { + sum = sum + sum0; + } /* end of critical 
*/ + } /* end of parallel */ + + known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; + fprintf(stderr, "known_sum = %d , sum = %d\n",known_sum,sum); + fprintf(stderr, "LOOPCOUNT = %d , i0 = %d\n",LOOPCOUNT,i0); + return ((known_sum == sum) && (i0 == LOOPCOUNT)); +} + +int main() +{ + int i; + int num_failed=0; + + for (i = 0; i < REPETITIONS; i++) { + if(!test_omp_for_lastprivate()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/for/omp_for_nowait.c b/final/runtime/test/worksharing/for/omp_for_nowait.c new file mode 100644 index 0000000..95a4775 --- /dev/null +++ b/final/runtime/test/worksharing/for/omp_for_nowait.c @@ -0,0 +1,77 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" + +/* + * This test will hang if the nowait is not working properly. + * + * It relies on a thread skipping to the second for construct to + * release the threads in the first for construct. + * + * Also, we use static scheduling to guarantee that one + * thread will make it to the second for construct. 
+ */ +volatile int release; +volatile int count; + +void wait_for_release_then_increment(int rank) +{ + fprintf(stderr, "Thread nr %d enters first for construct" + " and waits.\n", rank); + while (release == 0); + #pragma omp atomic + count++; +} + +void release_and_increment(int rank) +{ + fprintf(stderr, "Thread nr %d sets release to 1\n", rank); + release = 1; + #pragma omp atomic + count++; +} + +int test_omp_for_nowait() +{ + release = 0; + count = 0; + + #pragma omp parallel num_threads(4) + { + int rank; + int i; + + rank = omp_get_thread_num(); + + #pragma omp for schedule(static) nowait + for (i = 0; i < 4; i++) { + if (i < 3) + wait_for_release_then_increment(rank); + else { + fprintf(stderr, "Thread nr %d enters first for and goes " + "immediately to the next for construct to release.\n", rank); + #pragma omp atomic + count++; + } + } + + #pragma omp for schedule(static) + for (i = 0; i < 4; i++) { + release_and_increment(rank); + } + } + return (count==8); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_for_nowait()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/for/omp_for_ordered.c b/final/runtime/test/worksharing/for/omp_for_ordered.c new file mode 100644 index 0000000..18ac7eb --- /dev/null +++ b/final/runtime/test/worksharing/for/omp_for_ordered.c @@ -0,0 +1,60 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <math.h> +#include "omp_testsuite.h" + +static int last_i = 0; + +/* Utility function to check that i is increasing monotonically + with each call */ +static int check_i_islarger (int i) +{ + int islarger; + islarger = (i > last_i); + last_i = i; + return (islarger); +} + +int test_omp_for_ordered() +{ + int sum; + int is_larger = 1; + int known_sum; + + last_i = 0; + sum = 0; + + #pragma omp parallel + { + int i; + int my_islarger = 1; + #pragma omp for schedule(static,1) ordered + for (i = 1; i < 100; i++) { + #pragma omp 
ordered + { + my_islarger = check_i_islarger(i) && my_islarger; + sum = sum + i; + } + } + #pragma omp critical + { + is_larger = is_larger && my_islarger; + } + } + + known_sum=(99 * 100) / 2; + return ((known_sum == sum) && is_larger); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_for_ordered()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/for/omp_for_private.c b/final/runtime/test/worksharing/for/omp_for_private.c new file mode 100644 index 0000000..1f537b9 --- /dev/null +++ b/final/runtime/test/worksharing/for/omp_for_private.c @@ -0,0 +1,63 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <math.h> +#include "omp_testsuite.h" + +/* Utility function do spend some time in a loop */ +static void do_some_work() +{ + int i; + double sum = 0; + for(i = 0; i < 1000; i++){ + sum += sqrt ((double) i); + } +} + +int sum1; +#pragma omp threadprivate(sum1) + +int test_omp_for_private() +{ + int sum = 0; + int sum0; + int known_sum; + + sum0 = 0; /* setting (global) sum0 = 0 */ + + #pragma omp parallel + { + sum1 = 0; /* setting sum1 in each thread to 0 */ + { /* begin of orphaned block */ + int i; + #pragma omp for private(sum0) schedule(static,1) + for (i = 1; i <= LOOPCOUNT; i++) { + sum0 = sum1; + #pragma omp flush + sum0 = sum0 + i; + do_some_work (); + #pragma omp flush + sum1 = sum0; + } + } /* end of orphaned block */ + + #pragma omp critical + { + sum = sum + sum1; + } /*end of critical*/ + } /* end of parallel*/ + known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; + return (known_sum == sum); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_for_private()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/for/omp_for_reduction.c b/final/runtime/test/worksharing/for/omp_for_reduction.c new file mode 100644 index 0000000..28f0907 --- /dev/null +++ 
b/final/runtime/test/worksharing/for/omp_for_reduction.c @@ -0,0 +1,339 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include "omp_testsuite.h" + +#define DOUBLE_DIGITS 20 /* dt^DOUBLE_DIGITS */ +#define MAX_FACTOR 10 +#define KNOWN_PRODUCT 3628800 /* 10! */ + +int test_omp_for_reduction () +{ + double dt; + int sum; + int diff; + int product = 1; + double dsum; + double dknown_sum; + double ddiff; + int logic_and; + int logic_or; + int bit_and; + int bit_or; + int exclusiv_bit_or; + int *logics; + int i; + int known_sum; + int known_product; + double rounding_error = 1.E-9; /* over all rounding error to be + ignored in the double tests */ + double dpt; + int result = 0; + int logicsArray[LOOPCOUNT]; + + /* Variables for integer tests */ + sum = 0; + product = 1; + known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; + /* variabels for double tests */ + dt = 1. / 3.; /* base of geometric row for + and - test*/ + dsum = 0.; + /* Variabeles for logic tests */ + logics = logicsArray; + logic_and = 1; + logic_or = 0; + /* Variabeles for bit operators tests */ + bit_and = 1; + bit_or = 0; + /* Variables for exclusiv bit or */ + exclusiv_bit_or = 0; + + /************************************************************************/ + /** Tests for integers **/ + /************************************************************************/ + + /**** Testing integer addition ****/ + #pragma omp parallel + { + int j; + #pragma omp for schedule(dynamic,1) reduction(+:sum) + for (j = 1; j <= LOOPCOUNT; j++) { + sum = sum + j; + } + } + if (known_sum != sum) { + result++; + fprintf (stderr, "Error in sum with integers: Result was %d" + " instead of %d.\n", sum, known_sum); + } + + /**** Testing integer subtracton ****/ + diff = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; + #pragma omp parallel + { + int j; + #pragma omp for schedule(dynamic,1) reduction(-:diff) + for (j = 1; j <= LOOPCOUNT; j++) { + diff = diff - j; + } + } + if (diff != 0) { + 
result++; + fprintf (stderr, "Error in difference with integers: Result was %d" + " instead of 0.\n", diff); + } + + /**** Testing integer multiplication ****/ + #pragma omp parallel + { + int j; + #pragma omp for schedule(dynamic,1) reduction(*:product) + for (j = 1; j <= MAX_FACTOR; j++) { + product *= j; + } + } + known_product = KNOWN_PRODUCT; + if(known_product != product) { + result++; + fprintf (stderr,"Error in Product with integers: Result was %d" + " instead of %d\n",product,known_product); + } + + /************************************************************************/ + /** Tests for doubles **/ + /************************************************************************/ + + /**** Testing double addition ****/ + dsum = 0.; + dpt = 1.; + for (i = 0; i < DOUBLE_DIGITS; ++i) { + dpt *= dt; + } + dknown_sum = (1 - dpt) / (1 - dt); + #pragma omp parallel + { + int j; + #pragma omp for schedule(dynamic,1) reduction(+:dsum) + for (j = 0; j < DOUBLE_DIGITS; j++) { + dsum += pow (dt, j); + } + } + if (fabs (dsum - dknown_sum) > rounding_error) { + result++; + fprintf (stderr, "\nError in sum with doubles: Result was %f" + " instead of: %f (Difference: %E)\n", + dsum, dknown_sum, dsum-dknown_sum); + } + + /**** Testing double subtraction ****/ + ddiff = (1 - dpt) / (1 - dt); + #pragma omp parallel + { + int j; + #pragma omp for schedule(dynamic,1) reduction(-:ddiff) + for (j = 0; j < DOUBLE_DIGITS; ++j) { + ddiff -= pow (dt, j); + } + } + if (fabs (ddiff) > rounding_error) { + result++; + fprintf (stderr, "Error in Difference with doubles: Result was %E" + " instead of 0.0\n", ddiff); + } + + + /************************************************************************/ + /** Tests for logical values **/ + /************************************************************************/ + + /**** Testing logic and ****/ + for (i = 0; i < LOOPCOUNT; i++) { + logics[i] = 1; + } + + #pragma omp parallel + { + int j; + #pragma omp for schedule(dynamic,1) 
reduction(&&:logic_and) + for (j = 0; j < LOOPCOUNT; ++j) { + logic_and = (logic_and && logics[j]); + } + } + if(!logic_and) { + result++; + fprintf (stderr, "Error in logic AND part 1\n"); + } + + logic_and = 1; + logics[LOOPCOUNT / 2] = 0; + + #pragma omp parallel + { + int j; + #pragma omp for schedule(dynamic,1) reduction(&&:logic_and) + for (j = 0; j < LOOPCOUNT; ++j) { + logic_and = logic_and && logics[j]; + } + } + if(logic_and) { + result++; + fprintf (stderr, "Error in logic AND part 2\n"); + } + + /**** Testing logic or ****/ + for (i = 0; i < LOOPCOUNT; i++) { + logics[i] = 0; + } + + #pragma omp parallel + { + int j; + #pragma omp for schedule(dynamic,1) reduction(||:logic_or) + for (j = 0; j < LOOPCOUNT; ++j) { + logic_or = logic_or || logics[j]; + } + } + if (logic_or) { + result++; + fprintf (stderr, "Error in logic OR part 1\n"); + } + + logic_or = 0; + logics[LOOPCOUNT / 2] = 1; + + #pragma omp parallel + { + int j; + #pragma omp for schedule(dynamic,1) reduction(||:logic_or) + for (j = 0; j < LOOPCOUNT; ++j) { + logic_or = logic_or || logics[j]; + } + } + if(!logic_or) { + result++; + fprintf (stderr, "Error in logic OR part 2\n"); + } + + /************************************************************************/ + /** Tests for bit values **/ + /************************************************************************/ + + /**** Testing bit and ****/ + for (i = 0; i < LOOPCOUNT; ++i) { + logics[i] = 1; + } + + #pragma omp parallel + { + int j; + #pragma omp for schedule(dynamic,1) reduction(&:bit_and) + for (j = 0; j < LOOPCOUNT; ++j) { + bit_and = (bit_and & logics[j]); + } + } + if (!bit_and) { + result++; + fprintf (stderr, "Error in BIT AND part 1\n"); + } + + bit_and = 1; + logics[LOOPCOUNT / 2] = 0; + + #pragma omp parallel + { + int j; + #pragma omp for schedule(dynamic,1) reduction(&:bit_and) + for (j = 0; j < LOOPCOUNT; ++j) { + bit_and = bit_and & logics[j]; + } + } + if (bit_and) { + result++; + fprintf (stderr, "Error in BIT AND part 
2\n"); + } + + /**** Testing bit or ****/ + for (i = 0; i < LOOPCOUNT; i++) { + logics[i] = 0; + } + + #pragma omp parallel + { + int j; + #pragma omp for schedule(dynamic,1) reduction(|:bit_or) + for (j = 0; j < LOOPCOUNT; ++j) { + bit_or = bit_or | logics[j]; + } + } + if (bit_or) { + result++; + fprintf (stderr, "Error in BIT OR part 1\n"); + } + + bit_or = 0; + logics[LOOPCOUNT / 2] = 1; + + #pragma omp parallel + { + int j; + #pragma omp for schedule(dynamic,1) reduction(|:bit_or) + for (j = 0; j < LOOPCOUNT; ++j) { + bit_or = bit_or | logics[j]; + } + } + if (!bit_or) { + result++; + fprintf (stderr, "Error in BIT OR part 2\n"); + } + + /**** Testing exclusive bit or ****/ + for (i = 0; i < LOOPCOUNT; i++) { + logics[i] = 0; + } + + #pragma omp parallel + { + int j; + #pragma omp for schedule(dynamic,1) reduction(^:exclusiv_bit_or) + for (j = 0; j < LOOPCOUNT; ++j) { + exclusiv_bit_or = exclusiv_bit_or ^ logics[j]; + } + } + if (exclusiv_bit_or) { + result++; + fprintf (stderr, "Error in EXCLUSIV BIT OR part 1\n"); + } + + exclusiv_bit_or = 0; + logics[LOOPCOUNT / 2] = 1; + + #pragma omp parallel + { + int j; + #pragma omp for schedule(dynamic,1) reduction(^:exclusiv_bit_or) + for (j = 0; j < LOOPCOUNT; ++j) { + exclusiv_bit_or = exclusiv_bit_or ^ logics[j]; + } + } + if (!exclusiv_bit_or) { + result++; + fprintf (stderr, "Error in EXCLUSIV BIT OR part 2\n"); + } + + return (result == 0); + free (logics); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_for_reduction()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/for/omp_for_schedule_auto.c b/final/runtime/test/worksharing/for/omp_for_schedule_auto.c new file mode 100644 index 0000000..075617c --- /dev/null +++ b/final/runtime/test/worksharing/for/omp_for_schedule_auto.c @@ -0,0 +1,69 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include "omp_testsuite.h" 
+ +int sum1; +#pragma omp threadprivate(sum1) + +int test_omp_for_auto() +{ + int j; + int sum; + int sum0; + int known_sum; + int threadsnum; + + sum = 0; + sum0 = 12345; + + // array which keeps track of which threads participated in the for loop + // e.g., given 4 threads, [ 0 | 1 | 1 | 0 ] implies + // threads 0 and 3 did not, threads 1 and 2 did + int max_threads = omp_get_max_threads(); + int* active_threads = (int*)malloc(sizeof(int)*max_threads); + for(j = 0; j < max_threads; j++) + active_threads[j] = 0; + + #pragma omp parallel + { + int i; + sum1 = 0; + #pragma omp for firstprivate(sum0) schedule(auto) + for (i = 1; i <= LOOPCOUNT; i++) { + active_threads[omp_get_thread_num()] = 1; + sum0 = sum0 + i; + sum1 = sum0; + } + + #pragma omp critical + { + sum = sum + sum1; + } + } + + // count the threads that participated (sum is stored in threadsnum) + threadsnum=0; + for(j = 0; j < max_threads; j++) { + if(active_threads[j]) + threadsnum++; + } + free(active_threads); + + known_sum = 12345 * threadsnum + (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; + return (known_sum == sum); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_for_auto()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/for/omp_for_schedule_dynamic.c b/final/runtime/test/worksharing/for/omp_for_schedule_dynamic.c new file mode 100644 index 0000000..6d4f59b --- /dev/null +++ b/final/runtime/test/worksharing/for/omp_for_schedule_dynamic.c @@ -0,0 +1,89 @@ +// RUN: %libomp-compile-and-run +/* + * Test for dynamic scheduling with chunk size + * Method: caculate how many times the iteration space is dispatched + * and judge if each dispatch has the requested chunk size + * unless it is the last one. 
+ * It is possible for two adjacent chunks are assigned to the same thread + * Modified by Chunhua Liao + */ +#include <stdio.h> +#include <omp.h> +#include <stdlib.h> +#include "omp_testsuite.h" + +#define CFDMAX_SIZE 100 +const int chunk_size = 7; + +int test_omp_for_schedule_dynamic() +{ + int tid; + int *tids; + int i; + int tidsArray[CFDMAX_SIZE]; + int count = 0; + int tmp_count = 0; /*dispatch times*/ + int *tmp; /*store chunk size for each dispatch*/ + int result = 0; + + tids = tidsArray; + + #pragma omp parallel private(tid) shared(tids) + { /* begin of parallel */ + int tid; + tid = omp_get_thread_num (); + #pragma omp for schedule(dynamic,chunk_size) + for (i = 0; i < CFDMAX_SIZE; i++) { + tids[i] = tid; + } + } + + for (i = 0; i < CFDMAX_SIZE - 1; ++i) { + if (tids[i] != tids[i + 1]) { + count++; + } + } + + tmp = (int *) malloc (sizeof (int) * (count + 1)); + tmp[0] = 1; + + for (i = 0; i < CFDMAX_SIZE - 1; ++i) { + if (tmp_count > count) { + printf ("--------------------\nTestinternal Error: List too small!!!\n--------------------\n"); /* Error handling */ + break; + } + if (tids[i] != tids[i + 1]) { + tmp_count++; + tmp[tmp_count] = 1; + } else { + tmp[tmp_count]++; + } + } + /* is dynamic statement working? 
*/ + for (i = 0; i < count; i++) { + if ((tmp[i]%chunk_size)!=0) { + /* it is possible for 2 adjacent chunks assigned to a same thread */ + result++; + fprintf(stderr,"The intermediate dispatch has wrong chunksize.\n"); + /* result += ((tmp[i] / chunk_size) - 1); */ + } + } + if ((tmp[count]%chunk_size)!=(CFDMAX_SIZE%chunk_size)) { + result++; + fprintf(stderr,"the last dispatch has wrong chunksize.\n"); + } + /* for (int i=0;i<count+1;++i) printf("%d\t:=\t%d\n",i+1,tmp[i]); */ + return (result==0); +} +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_for_schedule_dynamic()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/for/omp_for_schedule_guided.c b/final/runtime/test/worksharing/for/omp_for_schedule_guided.c new file mode 100644 index 0000000..1ee7449 --- /dev/null +++ b/final/runtime/test/worksharing/for/omp_for_schedule_guided.c @@ -0,0 +1,217 @@ +// RUN: %libomp-compile-and-run + +/* Test for guided scheduling + * Ensure threads get chunks interleavely first + * Then judge the chunk sizes are decreasing to a stable value + * Modified by Chunhua Liao + * For example, 100 iteration on 2 threads, chunksize 7 + * one line for each dispatch, 0/1 means thread id + * 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 24 + * 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 18 + * 0 0 0 0 0 0 0 0 0 0 0 0 0 0 14 + * 1 1 1 1 1 1 1 1 1 1 10 + * 0 0 0 0 0 0 0 0 8 + * 1 1 1 1 1 1 1 7 + * 0 0 0 0 0 0 0 7 + * 1 1 1 1 1 1 1 7 + * 0 0 0 0 0 5 +*/ +#include <stdio.h> +#include <stdlib.h> +#include "omp_testsuite.h" +#include "omp_my_sleep.h" + +#define CFSMAX_SIZE 1000 +#define MAX_TIME 0.005 + +#ifdef SLEEPTIME +#undef SLEEPTIME +#define SLEEPTIME 0.0001 +#endif + +int test_omp_for_schedule_guided() +{ + int * tids; + int * chunksizes; + int notout; + int maxiter; + int threads; + int i; + int result; + + tids = (int *) malloc (sizeof (int) * (CFSMAX_SIZE + 1)); + maxiter = 0; + result = 1; + 
notout = 1; + + /* Testing if enough threads are available for this check. */ + #pragma omp parallel + { + #pragma omp single + { + threads = omp_get_num_threads(); + } + } + + /* ensure there are at least two threads */ + if (threads < 2) { + omp_set_num_threads(2); + threads = 2; + } + + /* Now the real parallel work: + * Each thread will start immediately with the first chunk. + */ + #pragma omp parallel shared(tids,maxiter) + { /* begin of parallel */ + double count; + int tid; + int j; + + tid = omp_get_thread_num (); + + #pragma omp for nowait schedule(guided) + for(j = 0; j < CFSMAX_SIZE; ++j) { + count = 0.; + #pragma omp flush(maxiter) + if (j > maxiter) { + #pragma omp critical + { + maxiter = j; + } + } + /*printf ("thread %d sleeping\n", tid);*/ + #pragma omp flush(maxiter,notout) + while (notout && (count < MAX_TIME) && (maxiter == j)) { + #pragma omp flush(maxiter,notout) + my_sleep (SLEEPTIME); + count += SLEEPTIME; +#ifdef VERBOSE + printf("."); +#endif + } +#ifdef VERBOSE + if (count > 0.) 
printf(" waited %lf s\n", count); +#endif + /*printf ("thread %d awake\n", tid);*/ + tids[j] = tid; +#ifdef VERBOSE + printf("%d finished by %d\n",j,tid); +#endif + } /* end of for */ + notout = 0; + #pragma omp flush(maxiter,notout) + } /* end of parallel */ + + /******************************************************* + * evaluation of the values * + *******************************************************/ + { + int determined_chunksize = 1; + int last_threadnr = tids[0]; + int global_chunknr = 0; + int openwork = CFSMAX_SIZE; + int expected_chunk_size; + int* local_chunknr = (int*)malloc(threads * sizeof(int)); + double c = 1; + + for (i = 0; i < threads; i++) + local_chunknr[i] = 0; + + tids[CFSMAX_SIZE] = -1; + + /* + * determine the number of global chunks + */ + // fprintf(stderr,"# global_chunknr thread local_chunknr chunksize\n"); + for(i = 1; i <= CFSMAX_SIZE; ++i) { + if (last_threadnr==tids[i]) { + determined_chunksize++; + } else { + /* fprintf(stderr, "%d\t%d\t%d\t%d\n", global_chunknr, + last_threadnr, local_chunknr[last_threadnr], m); */ + global_chunknr++; + local_chunknr[last_threadnr]++; + last_threadnr = tids[i]; + determined_chunksize = 1; + } + } + /* now allocate the memory for saving the sizes of the global chunks */ + chunksizes = (int*)malloc(global_chunknr * sizeof(int)); + + /* + * Evaluate the sizes of the global chunks + */ + global_chunknr = 0; + determined_chunksize = 1; + last_threadnr = tids[0]; + for (i = 1; i <= CFSMAX_SIZE; ++i) { + /* If the threadnumber was the same as before increase the + * detected chunksize for this chunk otherwise set the detected + * chunksize again to one and save the number of the next + * thread in last_threadnr. 
+ */ + if (last_threadnr == tids[i]) { + determined_chunksize++; + } else { + chunksizes[global_chunknr] = determined_chunksize; + global_chunknr++; + local_chunknr[last_threadnr]++; + last_threadnr = tids[i]; + determined_chunksize = 1; + } + } + +#ifdef VERBOSE + fprintf(stderr, "found\texpected\tconstant\n"); +#endif + + /* identify the constant c for the exponential + decrease of the chunksize */ + expected_chunk_size = openwork / threads; + c = (double) chunksizes[0] / expected_chunk_size; + + for (i = 0; i < global_chunknr; i++) { + /* calculate the new expected chunksize */ + if (expected_chunk_size > 1) + expected_chunk_size = c * openwork / threads; +#ifdef VERBOSE + fprintf(stderr, "%8d\t%8d\t%lf\n", chunksizes[i], + expected_chunk_size, c * chunksizes[i]/expected_chunk_size); +#endif + /* check if chunksize is inside the rounding errors */ + if (abs (chunksizes[i] - expected_chunk_size) >= 2) { + result = 0; +#ifndef VERBOSE + fprintf(stderr, "Chunksize differed from expected " + "value: %d instead of %d\n", chunksizes[i], + expected_chunk_size); + return 0; +#endif + } /* end if */ + +#ifndef VERBOSE + if (expected_chunk_size - chunksizes[i] < 0) + fprintf(stderr, "Chunksize did not decrease: %d" + " instead of %d\n", chunksizes[i],expected_chunk_size); +#endif + + /* calculating the remaining amount of work */ + openwork -= chunksizes[i]; + } + } + return result; +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_for_schedule_guided()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/for/omp_for_schedule_runtime.c b/final/runtime/test/worksharing/for/omp_for_schedule_runtime.c new file mode 100644 index 0000000..b957fc3 --- /dev/null +++ b/final/runtime/test/worksharing/for/omp_for_schedule_runtime.c @@ -0,0 +1,82 @@ +// RUN: %libomp-compile +// RUN: env OMP_SCHEDULE=static %libomp-run 1 0 +// RUN: env OMP_SCHEDULE=static,10 %libomp-run 1 10 +// RUN: env 
OMP_SCHEDULE=dynamic %libomp-run 2 1 +// RUN: env OMP_SCHEDULE=dynamic,11 %libomp-run 2 11 +// RUN: env OMP_SCHEDULE=guided %libomp-run 3 1 +// RUN: env OMP_SCHEDULE=guided,12 %libomp-run 3 12 +// RUN: env OMP_SCHEDULE=auto %libomp-run 4 1 +// RUN: env OMP_SCHEDULE=trapezoidal %libomp-run 101 1 +// RUN: env OMP_SCHEDULE=trapezoidal,13 %libomp-run 101 13 +// RUN: env OMP_SCHEDULE=static_steal %libomp-run 102 1 +// RUN: env OMP_SCHEDULE=static_steal,14 %libomp-run 102 14 +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include "omp_testsuite.h" + +int sum; +char* correct_kind_string; +omp_sched_t correct_kind; +int correct_chunk_size; + +int test_omp_for_runtime() +{ + int sum; + int known_sum; + int chunk_size; + int error; + omp_sched_t kind; + + sum = 0; + error = 0; + known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; + omp_get_schedule(&kind, &chunk_size); + + printf("omp_get_schedule() returns: Schedule = %d, Chunk Size = %d\n", + kind, chunk_size); + if (kind != correct_kind) { + printf("kind(%d) != correct_kind(%d)\n", kind, correct_kind); + error = 1; + } + if (chunk_size != correct_chunk_size) { + printf("chunk_size(%d) != correct_chunk_size(%d)\n", chunk_size, + correct_chunk_size); + error = 1; + } + + #pragma omp parallel + { + int i; + #pragma omp for schedule(runtime) + for (i = 1; i <= LOOPCOUNT; i++) { + #pragma omp critical + sum+=i; + } + } + if (known_sum != sum) { + printf("Known Sum = %d, Calculated Sum = %d\n", known_sum, sum); + error = 1; + } + return !error; +} + +int main(int argc, char** argv) +{ + int i; + int num_failed=0; + if (argc != 3) { + fprintf(stderr, "usage: %s schedule_kind chunk_size\n", argv[0]); + fprintf(stderr, " Run with envirable OMP_SCHEDULE=kind[,chunk_size]\n"); + return 1; + } + correct_kind = atoi(argv[1]); + correct_chunk_size = atoi(argv[2]); + + for (i = 0; i < REPETITIONS; i++) { + if (!test_omp_for_runtime()) { + num_failed++; + } + } + return num_failed; +} diff --git 
a/final/runtime/test/worksharing/for/omp_for_schedule_static.c b/final/runtime/test/worksharing/for/omp_for_schedule_static.c new file mode 100644 index 0000000..f46a544 --- /dev/null +++ b/final/runtime/test/worksharing/for/omp_for_schedule_static.c @@ -0,0 +1,154 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <stdlib.h> +#include "omp_testsuite.h" +#include "omp_my_sleep.h" + +#define CFSMAX_SIZE 1000 +#define MAX_TIME 0.01 + +#ifdef SLEEPTIME +#undef SLEEPTIME +#define SLEEPTIME 0.0005 +#endif + +int test_omp_for_schedule_static() +{ + int threads; + int i,lasttid; + int * tids; + int notout; + int maxiter; + int chunk_size; + int counter = 0; + int tmp_count=1; + int lastthreadsstarttid = -1; + int result = 1; + + chunk_size = 7; + tids = (int *) malloc (sizeof (int) * (CFSMAX_SIZE + 1)); + notout = 1; + maxiter = 0; + + #pragma omp parallel shared(tids,counter) + { /* begin of parallel*/ + #pragma omp single + { + threads = omp_get_num_threads (); + } /* end of single */ + } /* end of parallel */ + + if (threads < 2) { + omp_set_num_threads(2); + threads = 2; + } + fprintf (stderr,"Using an internal count of %d\nUsing a specified" + " chunksize of %d\n", CFSMAX_SIZE, chunk_size); + tids[CFSMAX_SIZE] = -1; /* setting endflag */ + #pragma omp parallel shared(tids) + { /* begin of parallel */ + double count; + int tid; + int j; + + tid = omp_get_thread_num (); + + #pragma omp for nowait schedule(static,chunk_size) + for(j = 0; j < CFSMAX_SIZE; ++j) { + count = 0.; + #pragma omp flush(maxiter) + if (j > maxiter) { + #pragma omp critical + { + maxiter = j; + } + } + /*printf ("thread %d sleeping\n", tid);*/ + while (notout && (count < MAX_TIME) && (maxiter == j)) { + #pragma omp flush(maxiter,notout) + my_sleep (SLEEPTIME); + count += SLEEPTIME; + printf("."); + } +#ifdef VERBOSE + if (count > 0.) 
printf(" waited %lf s\n", count); +#endif + /*printf ("thread %d awake\n", tid);*/ + tids[j] = tid; +#ifdef VERBOSE + printf("%d finished by %d\n",j,tid); +#endif + } /* end of for */ + notout = 0; + #pragma omp flush(maxiter,notout) + } /* end of parallel */ + + /**** analysing the data in array tids ****/ + + lasttid = tids[0]; + tmp_count = 0; + + for (i = 0; i < CFSMAX_SIZE + 1; ++i) { + /* If the work was done by the same thread increase tmp_count by one. */ + if (tids[i] == lasttid) { + tmp_count++; +#ifdef VERBOSE + fprintf (stderr, "%d: %d \n", i, tids[i]); +#endif + continue; + } + + /* Check if the next thread had has the right thread number. When finding + * threadnumber -1 the end should be reached. + */ + if (tids[i] == (lasttid + 1) % threads || tids[i] == -1) { + /* checking for the right chunk size */ + if (tmp_count == chunk_size) { + tmp_count = 1; + lasttid = tids[i]; +#ifdef VERBOSE + fprintf (stderr, "OK\n"); +#endif + } else { + /* If the chunk size was wrong, check if the end was reached */ + if (tids[i] == -1) { + if (i == CFSMAX_SIZE) { + fprintf (stderr, "Last thread had chunk size %d\n", + tmp_count); + break; + } else { + fprintf (stderr, "ERROR: Last thread (thread with" + " number -1) was found before the end.\n"); + result = 0; + } + } else { + fprintf (stderr, "ERROR: chunk size was %d. 
(assigned" + " was %d)\n", tmp_count, chunk_size); + result = 0; + } + } + } else { + fprintf(stderr, "ERROR: Found thread with number %d (should be" + " inbetween 0 and %d).", tids[i], threads - 1); + result = 0; + } +#ifdef VERBOSE + fprintf (stderr, "%d: %d \n", i, tids[i]); +#endif + } + + return result; +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_for_schedule_static()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/for/omp_for_schedule_static_3.c b/final/runtime/test/worksharing/for/omp_for_schedule_static_3.c new file mode 100644 index 0000000..922f27a --- /dev/null +++ b/final/runtime/test/worksharing/for/omp_for_schedule_static_3.c @@ -0,0 +1,202 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <stdlib.h> +#include "omp_testsuite.h" +#include "omp_my_sleep.h" + +#define CFSMAX_SIZE 1000 +#define MAX_TIME 0.01 + +#ifdef SLEEPTIME +#undef SLEEPTIME +#define SLEEPTIME 0.0005 +#endif + +#define VERBOSE 0 + +int test_omp_for_schedule_static_3() +{ + int threads; + int i,lasttid; + + int * tids; + int * tids2; + int notout; + int maxiter; + int chunk_size; + + int counter = 0; + int tmp_count=1; + int lastthreadsstarttid = -1; + int result = 1; + chunk_size = 7; + + tids = (int *) malloc (sizeof (int) * (CFSMAX_SIZE + 1)); + notout = 1; + maxiter = 0; + + #pragma omp parallel shared(tids,counter) + { /* begin of parallel*/ + #pragma omp single + { + threads = omp_get_num_threads (); + } /* end of single */ + } /* end of parallel */ + + /* Ensure that at least two threads are created */ + if (threads < 2) { + omp_set_num_threads(2); + threads = 2; + } + fprintf (stderr,"Using an internal count of %d\nUsing a" + " specified chunksize of %d\n", CFSMAX_SIZE, chunk_size); + tids[CFSMAX_SIZE] = -1; /* setting endflag */ + + #pragma omp parallel shared(tids) + { /* begin of parallel */ + double count; + int tid; + int j; + + tid = omp_get_thread_num 
(); + + #pragma omp for nowait schedule(static,chunk_size) + for(j = 0; j < CFSMAX_SIZE; ++j) { + count = 0.; + #pragma omp flush(maxiter) + if (j > maxiter) { + #pragma omp critical + { + maxiter = j; + } + } + /*printf ("thread %d sleeping\n", tid);*/ + while (notout && (count < MAX_TIME) && (maxiter == j)) { + #pragma omp flush(maxiter,notout) + my_sleep (SLEEPTIME); + count += SLEEPTIME; + printf("."); + } +#ifdef VERBOSE + if (count > 0.) printf(" waited %lf s\n", count); +#endif + /*printf ("thread %d awake\n", tid);*/ + tids[j] = tid; +#ifdef VERBOSE + printf("%d finished by %d\n",j,tid); +#endif + } /* end of omp parallel for */ + + notout = 0; + #pragma omp flush(maxiter,notout) + } /* end of parallel */ + + /**** analysing the data in array tids ****/ + + lasttid = tids[0]; + tmp_count = 0; + + for (i = 0; i < CFSMAX_SIZE + 1; ++i) { + /* If the work was done by the same thread + increase tmp_count by one. */ + if (tids[i] == lasttid) { + tmp_count++; +#ifdef VERBOSE + fprintf (stderr, "%d: %d \n", i, tids[i]); +#endif + continue; + } + + /* Check if the next thread had has the right thread number. + * When finding threadnumber -1 the end should be reached. + */ + if (tids[i] == (lasttid + 1) % threads || tids[i] == -1) { + /* checking for the right chunk size */ + if (tmp_count == chunk_size) { + tmp_count = 1; + lasttid = tids[i]; +#ifdef VERBOSE + fprintf (stderr, "OK\n"); +#endif + } else { + /* If the chunk size was wrong, check if the end was reached */ + if (tids[i] == -1) { + if (i == CFSMAX_SIZE) { + fprintf (stderr, "Last thread had chunk size %d\n", + tmp_count); + break; + } else { + fprintf (stderr, "ERROR: Last thread (thread with" + " number -1) was found before the end.\n"); + result = 0; + } + } else { + fprintf (stderr, "ERROR: chunk size was %d. 
(assigned" + " was %d)\n", tmp_count, chunk_size); + result = 0; + } + } + } else { + fprintf(stderr, "ERROR: Found thread with number %d (should be" + " inbetween 0 and %d).", tids[i], threads - 1); + result = 0; + } +#ifdef VERBOSE + fprintf (stderr, "%d: %d \n", i, tids[i]); +#endif + } + + /* Now we check if several loop regions in one parallel region have the + * same logical assignement of chunks to threads. We use the nowait + * clause to increase the probability to get an error. */ + + /* First we allocate some more memmory */ + free (tids); + tids = (int *) malloc (sizeof (int) * LOOPCOUNT); + tids2 = (int *) malloc (sizeof (int) * LOOPCOUNT); + + #pragma omp parallel + { + { + int n; + #pragma omp for schedule(static) nowait + for (n = 0; n < LOOPCOUNT; n++) { + if (LOOPCOUNT == n + 1 ) + my_sleep(SLEEPTIME); + + tids[n] = omp_get_thread_num(); + } + } + { + int m; + #pragma omp for schedule(static) nowait + for (m = 1; m <= LOOPCOUNT; m++) { + tids2[m-1] = omp_get_thread_num(); + } + } + } + + for (i = 0; i < LOOPCOUNT; i++) + if (tids[i] != tids2[i]) { + fprintf (stderr, "Chunk no. 
%d was assigned once to thread %d and" + " later to thread %d.\n", i, tids[i],tids2[i]); + result = 0; + } + + free (tids); + free (tids2); + return result; +} + +int main() +{ + int i; + int num_failed=0; + + for (i = 0; i < REPETITIONS; i++) { + if(!test_omp_for_schedule_static_3()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/for/omp_parallel_for_firstprivate.c b/final/runtime/test/worksharing/for/omp_parallel_for_firstprivate.c new file mode 100644 index 0000000..3b3bf7d --- /dev/null +++ b/final/runtime/test/worksharing/for/omp_parallel_for_firstprivate.c @@ -0,0 +1,35 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" + +int test_omp_parallel_for_firstprivate() +{ + int sum ; + int i2; + int i; + int known_sum; + + sum=0; + i2=3; + + #pragma omp parallel for reduction(+:sum) private(i) firstprivate(i2) + for (i = 1; i <= LOOPCOUNT; i++) { + sum = sum + (i + i2); + } + + known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2 + i2 * LOOPCOUNT; + return (known_sum == sum); +} /* end of check_parallel_for_fistprivate */ + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_parallel_for_firstprivate()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/for/omp_parallel_for_if.c b/final/runtime/test/worksharing/for/omp_parallel_for_if.c new file mode 100644 index 0000000..57fe498 --- /dev/null +++ b/final/runtime/test/worksharing/for/omp_parallel_for_if.c @@ -0,0 +1,42 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <math.h> +#include "omp_testsuite.h" + +int test_omp_parallel_for_if() +{ + int known_sum; + int num_threads; + int sum, sum2; + int i; + int control; + + control = 0; + num_threads=0; + sum = 0; + sum2 = 0; + + #pragma omp parallel for private(i) if (control==1) + for (i=0; i <= LOOPCOUNT; i++) { + num_threads = omp_get_num_threads(); + sum = sum + i; + } + + known_sum = 
(LOOPCOUNT * (LOOPCOUNT + 1)) / 2; + fprintf(stderr, "Number of threads determined by" + " omp_get_num_threads: %d\n", num_threads); + return (known_sum == sum && num_threads == 1); +} /* end of check_parallel_for_private */ + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_parallel_for_if()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/for/omp_parallel_for_lastprivate.c b/final/runtime/test/worksharing/for/omp_parallel_for_lastprivate.c new file mode 100644 index 0000000..a53cfb2 --- /dev/null +++ b/final/runtime/test/worksharing/for/omp_parallel_for_lastprivate.c @@ -0,0 +1,37 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" + +int test_omp_parallel_for_lastprivate() +{ + int sum; + int i; + int i0; + int known_sum; + + sum =0; + i0 = -1; + + #pragma omp parallel for reduction(+:sum) \ + schedule(static,7) private(i) lastprivate(i0) + for (i = 1; i <= LOOPCOUNT; i++) { + sum = sum + i; + i0 = i; + } /* end of parallel for */ + + known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; + return ((known_sum == sum) && (i0 == LOOPCOUNT)); +} /* end of check_parallel_for_lastprivate */ + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_parallel_for_lastprivate()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/for/omp_parallel_for_ordered.c b/final/runtime/test/worksharing/for/omp_parallel_for_ordered.c new file mode 100644 index 0000000..5fef460 --- /dev/null +++ b/final/runtime/test/worksharing/for/omp_parallel_for_ordered.c @@ -0,0 +1,64 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" + +static int last_i = 0; + +int i; +#pragma omp threadprivate(i) + +/* Variable ii is used to avoid problems with a threadprivate variable used as a loop + * index. See test omp_threadprivate_for. 
+ */ +static int ii; +#pragma omp threadprivate(ii) + +/*! + Utility function: returns true if the passed argument is larger than + the argument of the last call of this function. + */ +static int check_i_islarger2(int i) +{ + int islarger; + islarger = (i > last_i); + last_i = i; + return (islarger); +} + +int test_omp_parallel_for_ordered() +{ + int sum; + int is_larger; + int known_sum; + int i; + + sum = 0; + is_larger = 1; + last_i = 0; + #pragma omp parallel for schedule(static,1) private(i) ordered + for (i = 1; i < 100; i++) { + ii = i; + #pragma omp ordered + { + is_larger = check_i_islarger2 (ii) && is_larger; + sum = sum + ii; + } + } + known_sum = (99 * 100) / 2; + fprintf (stderr," known_sum = %d , sum = %d \n", known_sum, sum); + fprintf (stderr," is_larger = %d\n", is_larger); + return (known_sum == sum) && is_larger; +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_parallel_for_ordered()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/for/omp_parallel_for_private.c b/final/runtime/test/worksharing/for/omp_parallel_for_private.c new file mode 100644 index 0000000..1231d36 --- /dev/null +++ b/final/runtime/test/worksharing/for/omp_parallel_for_private.c @@ -0,0 +1,50 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <math.h> +#include "omp_testsuite.h" + +/*! 
Utility function to spend some time in a loop */ +static void do_some_work (void) +{ + int i; + double sum = 0; + for(i = 0; i < 1000; i++){ + sum += sqrt (i); + } +} + +int test_omp_parallel_for_private() +{ + int sum; + int i; + int i2; + int known_sum; + + sum =0; + i2=0; + + #pragma omp parallel for reduction(+:sum) schedule(static,1) private(i) private(i2) + for (i=1;i<=LOOPCOUNT;i++) + { + i2 = i; + #pragma omp flush + do_some_work (); + #pragma omp flush + sum = sum + i2; + } /*end of for*/ + known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; + return (known_sum == sum); +} /* end of check_parallel_for_private */ + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_parallel_for_private()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/for/omp_parallel_for_reduction.c b/final/runtime/test/worksharing/for/omp_parallel_for_reduction.c new file mode 100644 index 0000000..118d730 --- /dev/null +++ b/final/runtime/test/worksharing/for/omp_parallel_for_reduction.c @@ -0,0 +1,266 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <math.h> +#include "omp_testsuite.h" + +#define DOUBLE_DIGITS 20 /* dt^DOUBLE_DIGITS */ +#define MAX_FACTOR 10 +#define KNOWN_PRODUCT 3628800 /* 10! 
*/ + +int test_omp_parallel_for_reduction() +{ + int sum; + int known_sum; + double dsum; + double dknown_sum; + double dt=0.5; /* base of geometric row for + and - test*/ + double rounding_error= 1.E-9; + int diff; + double ddiff; + int product; + int known_product; + int logic_and; + int logic_or; + int bit_and; + int bit_or; + int exclusiv_bit_or; + int logics[LOOPCOUNT]; + int i; + double dpt; + int result; + + sum =0; + dsum=0; + dt = 1./3.; + result = 0; + product = 1; + logic_and=1; + logic_or=0; + bit_and=1; + bit_or=0; + exclusiv_bit_or=0; + + /* Tests for integers */ + known_sum = (LOOPCOUNT*(LOOPCOUNT+1))/2; + #pragma omp parallel for schedule(dynamic,1) private(i) reduction(+:sum) + for (i=1;i<=LOOPCOUNT;i++) { + sum=sum+i; + } + if(known_sum!=sum) { + result++; + fprintf(stderr,"Error in sum with integers: Result was %d" + " instead of %d\n",sum,known_sum); + } + + diff = (LOOPCOUNT*(LOOPCOUNT+1))/2; + #pragma omp parallel for schedule(dynamic,1) private(i) reduction(-:diff) + for (i=1;i<=LOOPCOUNT;++i) { + diff=diff-i; + } + if(diff != 0) { + result++; + fprintf(stderr,"Error in difference with integers: Result was %d" + " instead of 0.\n",diff); + } + + /* Tests for doubles */ + dsum=0; + dpt=1; + for (i=0;i<DOUBLE_DIGITS;++i) { + dpt*=dt; + } + dknown_sum = (1-dpt)/(1-dt); + #pragma omp parallel for schedule(dynamic,1) private(i) reduction(+:dsum) + for (i=0;i<DOUBLE_DIGITS;++i) { + dsum += pow(dt,i); + } + if( fabs(dsum-dknown_sum) > rounding_error ) { + result++; + fprintf(stderr,"Error in sum with doubles: Result was %f" + " instead of %f (Difference: %E)\n", + dsum, dknown_sum, dsum-dknown_sum); + } + + dpt=1; + + for (i=0;i<DOUBLE_DIGITS;++i) { + dpt*=dt; + } + fprintf(stderr,"\n"); + ddiff = (1-dpt)/(1-dt); + #pragma omp parallel for schedule(dynamic,1) private(i) reduction(-:ddiff) + for (i=0;i<DOUBLE_DIGITS;++i) { + ddiff -= pow(dt,i); + } + if( fabs(ddiff) > rounding_error) { + result++; + fprintf(stderr,"Error in Difference with doubles: 
Result was %E" + " instead of 0.0\n",ddiff); + } + + /* Tests for integers */ + #pragma omp parallel for schedule(dynamic,1) private(i) reduction(*:product) + for(i=1;i<=MAX_FACTOR;i++) { + product *= i; + } + known_product = KNOWN_PRODUCT; + if(known_product != product) { + result++; + fprintf(stderr,"Error in Product with integers: Result was %d" + " instead of %d\n\n",product,known_product); + } + + /* Tests for logic AND */ + for(i=0;i<LOOPCOUNT;i++) { + logics[i]=1; + } + + #pragma omp parallel for schedule(dynamic,1) private(i) \ + reduction(&&:logic_and) + for(i=0;i<LOOPCOUNT;++i) { + logic_and = (logic_and && logics[i]); + } + if(!logic_and) { + result++; + fprintf(stderr,"Error in logic AND part 1.\n"); + } + + logic_and = 1; + logics[LOOPCOUNT/2]=0; + + #pragma omp parallel for schedule(dynamic,1) private(i) \ + reduction(&&:logic_and) + for(i=0;i<LOOPCOUNT;++i) { + logic_and = logic_and && logics[i]; + } + if(logic_and) { + result++; + fprintf(stderr,"Error in logic AND part 2.\n"); + } + + /* Tests for logic OR */ + for(i=0;i<LOOPCOUNT;i++) { + logics[i]=0; + } + + #pragma omp parallel for schedule(dynamic,1) private(i) \ + reduction(||:logic_or) + for(i=0;i<LOOPCOUNT;++i) { + logic_or = logic_or || logics[i]; + } + if(logic_or) { + result++; + fprintf(stderr,"Error in logic OR part 1.\n"); + } + logic_or = 0; + logics[LOOPCOUNT/2]=1; + + #pragma omp parallel for schedule(dynamic,1) private(i) \ + reduction(||:logic_or) + for(i=0;i<LOOPCOUNT;++i) { + logic_or = logic_or || logics[i]; + } + if(!logic_or) { + result++; + fprintf(stderr,"Error in logic OR part 2.\n"); + } + + /* Tests for bitwise AND */ + for(i=0;i<LOOPCOUNT;++i) { + logics[i]=1; + } + + #pragma omp parallel for schedule(dynamic,1) private(i) \ + reduction(&:bit_and) + for(i=0;i<LOOPCOUNT;++i) { + bit_and = (bit_and & logics[i]); + } + if(!bit_and) { + result++; + fprintf(stderr,"Error in BIT AND part 1.\n"); + } + + bit_and = 1; + logics[LOOPCOUNT/2]=0; + + #pragma omp parallel for 
schedule(dynamic,1) private(i) \ + reduction(&:bit_and) + for(i=0;i<LOOPCOUNT;++i) { + bit_and = bit_and & logics[i]; + } + if(bit_and) { + result++; + fprintf(stderr,"Error in BIT AND part 2.\n"); + } + + /* Tests for bitwise OR */ + for(i=0;i<LOOPCOUNT;i++) { + logics[i]=0; + } + + #pragma omp parallel for schedule(dynamic,1) private(i) \ + reduction(|:bit_or) + for(i=0;i<LOOPCOUNT;++i) { + bit_or = bit_or | logics[i]; + } + if(bit_or) { + result++; + fprintf(stderr,"Error in BIT OR part 1\n"); + } + bit_or = 0; + logics[LOOPCOUNT/2]=1; + + #pragma omp parallel for schedule(dynamic,1) private(i) \ + reduction(|:bit_or) + for(i=0;i<LOOPCOUNT;++i) { + bit_or = bit_or | logics[i]; + } + if(!bit_or) { + result++; + fprintf(stderr,"Error in BIT OR part 2\n"); + } + + /* Tests for bitwise XOR */ + for(i=0;i<LOOPCOUNT;i++) { + logics[i]=0; + } + + #pragma omp parallel for schedule(dynamic,1) private(i) \ + reduction(^:exclusiv_bit_or) + for(i=0;i<LOOPCOUNT;++i) { + exclusiv_bit_or = exclusiv_bit_or ^ logics[i]; + } + if(exclusiv_bit_or) { + result++; + fprintf(stderr,"Error in EXCLUSIV BIT OR part 1\n"); + } + + exclusiv_bit_or = 0; + logics[LOOPCOUNT/2]=1; + + #pragma omp parallel for schedule(dynamic,1) private(i) \ + reduction(^:exclusiv_bit_or) + for(i=0;i<LOOPCOUNT;++i) { + exclusiv_bit_or = exclusiv_bit_or ^ logics[i]; + } + if(!exclusiv_bit_or) { + result++; + fprintf(stderr,"Error in EXCLUSIV BIT OR part 2\n"); + } + + /*printf("\nResult:%d\n",result);*/ + return (result==0); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_parallel_for_reduction()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/sections/omp_parallel_sections_firstprivate.c b/final/runtime/test/worksharing/sections/omp_parallel_sections_firstprivate.c new file mode 100644 index 0000000..1780fab --- /dev/null +++ b/final/runtime/test/worksharing/sections/omp_parallel_sections_firstprivate.c @@ 
-0,0 +1,54 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" + +int test_omp_parallel_sections_firstprivate() +{ + int sum; + int sum0; + int known_sum; + + sum =7; + sum0=11; + + #pragma omp parallel sections firstprivate(sum0) + { + #pragma omp section + { + #pragma omp critical + { + sum= sum+sum0; + } + } + #pragma omp section + { + #pragma omp critical + { + sum= sum+sum0; + } + } + #pragma omp section + { + #pragma omp critical + { + sum= sum+sum0; + } + } + } + + known_sum=11*3+7; + return (known_sum==sum); +} /* end of check_section_firstprivate*/ + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_parallel_sections_firstprivate()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/sections/omp_parallel_sections_lastprivate.c b/final/runtime/test/worksharing/sections/omp_parallel_sections_lastprivate.c new file mode 100644 index 0000000..9b775ec --- /dev/null +++ b/final/runtime/test/worksharing/sections/omp_parallel_sections_lastprivate.c @@ -0,0 +1,71 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" + +int test_omp_parallel_sections_lastprivate() +{ + int sum; + int sum0; + int i; + int i0; + int known_sum; + sum =0; + sum0 = 0; + i0 = -1; + + #pragma omp parallel sections private(i,sum0) lastprivate(i0) + { + #pragma omp section + { + sum0=0; + for (i=1;i<400;i++) { + sum0=sum0+i; + i0=i; + } + #pragma omp critical + { + sum= sum+sum0; + } + } + #pragma omp section + { + sum0=0; + for(i=400;i<700;i++) { + sum0=sum0+i; + i0=i; + } + #pragma omp critical + { + sum= sum+sum0; + } + } + #pragma omp section + { + sum0=0; + for(i=700;i<1000;i++) { + sum0=sum0+i; + i0=i; + } + #pragma omp critical + { + sum= sum+sum0; + } + } + } + + known_sum=(999*1000)/2; + return ((known_sum==sum) && (i0==999) ); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + 
if(!test_omp_parallel_sections_lastprivate()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/sections/omp_parallel_sections_private.c b/final/runtime/test/worksharing/sections/omp_parallel_sections_private.c new file mode 100644 index 0000000..7dab295 --- /dev/null +++ b/final/runtime/test/worksharing/sections/omp_parallel_sections_private.c @@ -0,0 +1,64 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" + +int test_omp_parallel_sections_private() +{ + int sum; + int sum0; + int i; + int known_sum; + + sum = 7; + sum0=0; + + #pragma omp parallel sections private(sum0, i) + { + #pragma omp section + { + sum0=0; + for (i=1;i<400;i++) + sum0=sum0+i; + #pragma omp critical + { + sum= sum+sum0; + } + } + #pragma omp section + { + sum0=0; + for(i=400;i<700;i++) + sum0=sum0+i; + #pragma omp critical + { + sum= sum+sum0; + } + } + #pragma omp section + { + sum0=0; + for(i=700;i<1000;i++) + sum0=sum0+i; + #pragma omp critical + { + sum= sum+sum0; + } + } + } + + known_sum=(999*1000)/2+7; + return (known_sum==sum); +} /* end of check_section_private*/ + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_parallel_sections_private()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/sections/omp_parallel_sections_reduction.c b/final/runtime/test/worksharing/sections/omp_parallel_sections_reduction.c new file mode 100644 index 0000000..0d49865 --- /dev/null +++ b/final/runtime/test/worksharing/sections/omp_parallel_sections_reduction.c @@ -0,0 +1,508 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <math.h> +#include "omp_testsuite.h" + +int test_omp_parallel_sections_reduction() +{ + int sum; + int known_sum; + double dpt; + double dsum; + double dknown_sum; + double dt=0.5; /* base of geometric row for + and - test*/ + double rounding_error= 1.E-5; + int diff; + double ddiff; + int product; 
+ int known_product; + int logic_and; + int bit_and; + int logic_or; + int bit_or; + int exclusiv_bit_or; + int logics[1000]; + int i; + int result; + + sum = 7; + dsum=0; + product =1; + dpt = 1; + logic_and=1; + bit_and=1; + logic_or=0; + bit_or=0; + exclusiv_bit_or=0; + result =0; + /* int my_islarger;*/ + /*int is_larger=1;*/ + + // Test summation of integers + known_sum = (999*1000)/2+7; + #pragma omp parallel sections private(i) reduction(+:sum) + { + #pragma omp section + { + for (i=1;i<300;i++) { + sum=sum+i; + } + } + #pragma omp section + { + for (i=300;i<700;i++) { + sum=sum+i; + } + } + #pragma omp section + { + for (i=700;i<1000;i++) { + sum=sum+i; + } + } + } + if(known_sum!=sum) { + result++; + fprintf(stderr,"Error in sum with integers: Result was %d" + " instead of %d.\n",sum, known_sum); + } + + // Test differences of integers + diff = (999*1000)/2; + #pragma omp parallel sections private(i) reduction(-:diff) + { + #pragma omp section + { + for (i=1;i<300;i++) { + diff=diff-i; + } + } + #pragma omp section + { + for (i=300;i<700;i++) { + diff=diff-i; + } + } + #pragma omp section + { + for (i=700;i<1000;i++) { + diff=diff-i; + } + } + } + if(diff != 0) { + result++; + fprintf(stderr,"Error in Difference with integers: Result was %d" + " instead of 0.\n",diff); + } + + // Test summation of doubles + for (i=0;i<20;++i) { + dpt*=dt; + } + dknown_sum = (1-dpt)/(1-dt); + #pragma omp parallel sections private(i) reduction(+:dsum) + { + #pragma omp section + { + for (i=0;i<6;++i) { + dsum += pow(dt,i); + } + } + #pragma omp section + { + for (i=6;i<12;++i) { + dsum += pow(dt,i); + } + } + #pragma omp section + { + for (i=12;i<20;++i) { + dsum += pow(dt,i); + } + } + } + if( fabs(dsum-dknown_sum) > rounding_error ) { + result++; + fprintf(stderr,"Error in sum with doubles: Result was %f" + " instead of %f (Difference: %E)\n", + dsum, dknown_sum, dsum-dknown_sum); + } + + // Test differences of doubles + dpt=1; + for (i=0;i<20;++i) { + dpt*=dt; + } + 
fprintf(stderr,"\n"); + ddiff = (1-dpt)/(1-dt); + #pragma omp parallel sections private(i) reduction(-:ddiff) + { + #pragma omp section + { + for (i=0;i<6;++i) { + ddiff -= pow(dt,i); + } + } + #pragma omp section + { + for (i=6;i<12;++i) { + ddiff -= pow(dt,i); + } + } + #pragma omp section + { + for (i=12;i<20;++i) { + ddiff -= pow(dt,i); + } + } + } + if( fabs(ddiff) > rounding_error) { + result++; + fprintf(stderr,"Error in Difference with doubles: Result was %E" + " instead of 0.0\n",ddiff); + } + + // Test product of integers + known_product = 3628800; + #pragma omp parallel sections private(i) reduction(*:product) + { + #pragma omp section + { + for(i=1;i<3;i++) { + product *= i; + } + } + #pragma omp section + { + for(i=3;i<7;i++) { + product *= i; + } + } + #pragma omp section + { + for(i=7;i<11;i++) { + product *= i; + } + } + } + if(known_product != product) { + result++; + fprintf(stderr,"Error in Product with integers: Result was %d" + " instead of %d\n",product,known_product); + } + + // Test logical AND + for(i=0;i<1000;i++) { + logics[i]=1; + } + + #pragma omp parallel sections private(i) reduction(&&:logic_and) + { + #pragma omp section + { + for (i=1;i<300;i++) { + logic_and = (logic_and && logics[i]); + } + } + #pragma omp section + { + for (i=300;i<700;i++) { + logic_and = (logic_and && logics[i]); + } + } + #pragma omp section + { + for (i=700;i<1000;i++) { + logic_and = (logic_and && logics[i]); + } + } + } + if(!logic_and) { + result++; + fprintf(stderr,"Error in logic AND part 1\n"); + } + logic_and = 1; + logics[501] = 0; + + #pragma omp parallel sections private(i) reduction(&&:logic_and) + { + #pragma omp section + { + for (i=1;i<300;i++) { + logic_and = (logic_and && logics[i]); + } + } + #pragma omp section + { + for (i=300;i<700;i++) { + logic_and = (logic_and && logics[i]); + } + } + #pragma omp section + { + for (i=700;i<1000;i++) { + logic_and = (logic_and && logics[i]); + } + } + } + if(logic_and) { + result++; + 
fprintf(stderr,"Error in logic AND part 2"); + } + + // Test logical OR + for(i=0;i<1000;i++) { + logics[i]=0; + } + + #pragma omp parallel sections private(i) reduction(||:logic_or) + { + #pragma omp section + { + for (i=1;i<300;i++) { + logic_or = (logic_or || logics[i]); + } + } + #pragma omp section + { + for (i=300;i<700;i++) { + logic_or = (logic_or || logics[i]); + } + } + #pragma omp section + { + for (i=700;i<1000;i++) { + logic_or = (logic_or || logics[i]); + } + } + } + if(logic_or) { + result++; + fprintf(stderr,"Error in logic OR part 1\n"); + } + + logic_or = 0; + logics[501]=1; + + #pragma omp parallel sections private(i) reduction(||:logic_or) + { + #pragma omp section + { + for (i=1;i<300;i++) { + logic_or = (logic_or || logics[i]); + } + } + #pragma omp section + { + for (i=300;i<700;i++) { + logic_or = (logic_or || logics[i]); + } + } + #pragma omp section + { + for (i=700;i<1000;i++) { + logic_or = (logic_or || logics[i]); + } + } + } + if(!logic_or) { + result++; + fprintf(stderr,"Error in logic OR part 2\n"); + } + + // Test bitwise AND + for(i=0;i<1000;++i) { + logics[i]=1; + } + + #pragma omp parallel sections private(i) reduction(&:bit_and) + { + #pragma omp section + { + for(i=0;i<300;++i) { + bit_and = (bit_and & logics[i]); + } + } + #pragma omp section + { + for(i=300;i<700;++i) { + bit_and = (bit_and & logics[i]); + } + } + #pragma omp section + { + for(i=700;i<1000;++i) { + bit_and = (bit_and & logics[i]); + } + } + } + if(!bit_and) { + result++; + fprintf(stderr,"Error in BIT AND part 1\n"); + } + + bit_and = 1; + logics[501]=0; + + #pragma omp parallel sections private(i) reduction(&:bit_and) + { + #pragma omp section + { + for(i=0;i<300;++i) { + bit_and = bit_and & logics[i]; + } + } + #pragma omp section + { + for(i=300;i<700;++i) { + bit_and = bit_and & logics[i]; + } + } + #pragma omp section + { + for(i=700;i<1000;++i) { + bit_and = bit_and & logics[i]; + } + } + } + if(bit_and) { + result++; + fprintf(stderr,"Error in BIT AND 
part 2"); + } + + // Test bitwise OR + for(i=0;i<1000;i++) { + logics[i]=0; + } + + #pragma omp parallel sections private(i) reduction(|:bit_or) + { + #pragma omp section + { + for(i=0;i<300;++i) { + bit_or = bit_or | logics[i]; + } + } + #pragma omp section + { + for(i=300;i<700;++i) { + bit_or = bit_or | logics[i]; + } + } + #pragma omp section + { + for(i=700;i<1000;++i) { + bit_or = bit_or | logics[i]; + } + } + } + if(bit_or) { + result++; + fprintf(stderr,"Error in BIT OR part 1\n"); + } + bit_or = 0; + logics[501]=1; + + #pragma omp parallel sections private(i) reduction(|:bit_or) + { + #pragma omp section + { + for(i=0;i<300;++i) { + bit_or = bit_or | logics[i]; + } + } + #pragma omp section + { + for(i=300;i<700;++i) { + bit_or = bit_or | logics[i]; + } + } + #pragma omp section + { + for(i=700;i<1000;++i) { + bit_or = bit_or | logics[i]; + } + } + } + if(!bit_or) { + result++; + fprintf(stderr,"Error in BIT OR part 2\n"); + } + + // Test bitwise XOR + for(i=0;i<1000;i++) { + logics[i]=0; + } + + #pragma omp parallel sections private(i) reduction(^:exclusiv_bit_or) + { + #pragma omp section + { + for(i=0;i<300;++i) { + exclusiv_bit_or = exclusiv_bit_or ^ logics[i]; + } + } + #pragma omp section + { + for(i=300;i<700;++i) { + exclusiv_bit_or = exclusiv_bit_or ^ logics[i]; + } + } + #pragma omp section + { + for(i=700;i<1000;++i) { + exclusiv_bit_or = exclusiv_bit_or ^ logics[i]; + } + } + } + if(exclusiv_bit_or) { + result++; + fprintf(stderr,"Error in EXCLUSIV BIT OR part 1\n"); + } + + exclusiv_bit_or = 0; + logics[501]=1; + + #pragma omp parallel sections private(i) reduction(^:exclusiv_bit_or) + { + #pragma omp section + { + for(i=0;i<300;++i) { + exclusiv_bit_or = exclusiv_bit_or ^ logics[i]; + } + } + #pragma omp section + { + for(i=300;i<700;++i) { + exclusiv_bit_or = exclusiv_bit_or ^ logics[i]; + } + } + #pragma omp section + { + for(i=700;i<1000;++i) { + exclusiv_bit_or = exclusiv_bit_or ^ logics[i]; + } + } + } + if(!exclusiv_bit_or) { + 
result++; + fprintf(stderr,"Error in EXCLUSIV BIT OR part 2\n"); + } + + /*printf("\nResult:%d\n",result);*/ + return (result==0); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_parallel_sections_reduction()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/sections/omp_section_firstprivate.c b/final/runtime/test/worksharing/sections/omp_section_firstprivate.c new file mode 100644 index 0000000..5526475 --- /dev/null +++ b/final/runtime/test/worksharing/sections/omp_section_firstprivate.c @@ -0,0 +1,55 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" + +int test_omp_section_firstprivate() +{ + int sum; + int sum0; + int known_sum; + + sum0 = 11; + sum = 7; + #pragma omp parallel + { + #pragma omp sections firstprivate(sum0) + { + #pragma omp section + { + #pragma omp critical + { + sum = sum + sum0; + } + } + #pragma omp section + { + #pragma omp critical + { + sum = sum + sum0; + } + } + #pragma omp section + { + #pragma omp critical + { + sum = sum + sum0; + } + } + } + } + known_sum = 11 * 3 + 7; + return (known_sum == sum); +} /* end of check_section_firstprivate*/ + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_section_firstprivate()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/sections/omp_section_lastprivate.c b/final/runtime/test/worksharing/sections/omp_section_lastprivate.c new file mode 100644 index 0000000..0dbbea9 --- /dev/null +++ b/final/runtime/test/worksharing/sections/omp_section_lastprivate.c @@ -0,0 +1,76 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" + +int test_omp_section_lastprivate() +{ + int i0 = -1; + int sum = 0; + int i; + int sum0 = 0; + int known_sum; + + i0 = -1; + sum = 0; + + #pragma omp parallel + { + #pragma omp sections lastprivate(i0) private(i,sum0) + { + 
#pragma omp section + { + sum0 = 0; + for (i = 1; i < 400; i++) + { + sum0 = sum0 + i; + i0 = i; + } + #pragma omp critical + { + sum = sum + sum0; + } /*end of critical*/ + } /* end of section */ + #pragma omp section + { + sum0 = 0; + for(i = 400; i < 700; i++) + { + sum0 = sum0 + i; + i0 = i; + } + #pragma omp critical + { + sum = sum + sum0; + } /*end of critical*/ + } + #pragma omp section + { + sum0 = 0; + for(i = 700; i < 1000; i++) + { + sum0 = sum0 + i; + i0 = i; + } + #pragma omp critical + { + sum = sum + sum0; + } /*end of critical*/ + } /* end of section */ + } /* end of sections*/ + } /* end of parallel*/ + known_sum = (999 * 1000) / 2; + return ((known_sum == sum) && (i0 == 999) ); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_section_lastprivate()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/sections/omp_section_private.c b/final/runtime/test/worksharing/sections/omp_section_private.c new file mode 100644 index 0000000..bf2a30d --- /dev/null +++ b/final/runtime/test/worksharing/sections/omp_section_private.c @@ -0,0 +1,66 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" + +int test_omp_section_private() +{ + int sum; + int sum0; + int i; + int known_sum; + + sum = 7; + sum0 = 0; + + #pragma omp parallel + { + #pragma omp sections private(sum0,i) + { + #pragma omp section + { + sum0 = 0; + for (i = 1; i < 400; i++) + sum0 = sum0 + i; + #pragma omp critical + { + sum = sum + sum0; + } + } + #pragma omp section + { + sum0 = 0; + for (i = 400; i < 700; i++) + sum0 = sum0 + i; + #pragma omp critical + { + sum = sum + sum0; + } + } + #pragma omp section + { + sum0 = 0; + for (i = 700; i < 1000; i++) + sum0 = sum0 + i; + #pragma omp critical + { + sum = sum + sum0; + } + } + } /*end of sections*/ + } /* end of parallel */ + known_sum = (999 * 1000) / 2 + 7; + return (known_sum == sum); +} /* end of 
check_section_private*/ + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_section_private()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/sections/omp_sections_nowait.c b/final/runtime/test/worksharing/sections/omp_sections_nowait.c new file mode 100644 index 0000000..caff254 --- /dev/null +++ b/final/runtime/test/worksharing/sections/omp_sections_nowait.c @@ -0,0 +1,104 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" + +/* + * This test will hang if the nowait is not working properly + * + * It relies on a thread skipping to the second sections construct to + * release the threads in the first sections construct + * + * Also, since scheduling of sections is implementation defined, it is + * necessary to have all four sections in the second sections construct + * release the threads since we can't guarantee which section a single thread + * will execute. 
+ */ +volatile int release; +volatile int count; + +void wait_for_release_then_increment(int rank) +{ + fprintf(stderr, "Thread nr %d enters first section" + " and waits.\n", rank); + while (release == 0); + #pragma omp atomic + count++; +} + +void release_and_increment(int rank) +{ + fprintf(stderr, "Thread nr %d sets release to 1\n", rank); + release = 1; + #pragma omp flush(release) + #pragma omp atomic + count++; +} + +int test_omp_sections_nowait() +{ + release = 0; + count = 0; + + #pragma omp parallel num_threads(4) + { + int rank; + rank = omp_get_thread_num (); + #pragma omp sections nowait + { + #pragma omp section + { + wait_for_release_then_increment(rank); + } + #pragma omp section + { + wait_for_release_then_increment(rank); + } + #pragma omp section + { + wait_for_release_then_increment(rank); + } + #pragma omp section + { + fprintf(stderr, "Thread nr %d enters first sections and goes " + "immediately to next sections construct to release.\n", rank); + #pragma omp atomic + count++; + } + } + /* Begin of second sections environment */ + #pragma omp sections + { + #pragma omp section + { + release_and_increment(rank); + } + #pragma omp section + { + release_and_increment(rank); + } + #pragma omp section + { + release_and_increment(rank); + } + #pragma omp section + { + release_and_increment(rank); + } + } + } + // Check to make sure all eight sections were executed + return (count==8); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_sections_nowait()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/sections/omp_sections_reduction.c b/final/runtime/test/worksharing/sections/omp_sections_reduction.c new file mode 100644 index 0000000..1fdb5ec --- /dev/null +++ b/final/runtime/test/worksharing/sections/omp_sections_reduction.c @@ -0,0 +1,543 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <math.h> +#include "omp_testsuite.h" + +int 
test_omp_sections_reduction() +{ + int sum; + int known_sum; + double dpt,dsum; + double dknown_sum; + double dt=0.5; /* base of geometric row for + and - test*/ + double rounding_error= 1.E-9; + int diff; + double ddiff; + int product; + int known_product; + int logic_and; + int bit_and; + int logic_or; + int bit_or; + int exclusiv_bit_or; + int logics[1000]; + int i; + int result; + /* int my_islarger; */ + /*int is_larger=1;*/ + sum =7; + dpt =1; + dsum=0; + product =1; + logic_and=1; + bit_and=1; + logic_or=0; + bit_or=0; + exclusiv_bit_or=0; + result = 0; + dt = 1./3.; + + known_sum = (999*1000)/2+7; + #pragma omp parallel + { + #pragma omp sections private(i) reduction(+:sum) + { + #pragma omp section + { + for (i=1;i<300;i++) { + sum=sum+i; + } + } + #pragma omp section + { + for (i=300;i<700;i++) { + sum=sum+i; + } + } + #pragma omp section + { + for (i=700;i<1000;i++) { + sum=sum+i; + } + } + } + } + if(known_sum!=sum) { + ++result; + fprintf(stderr,"Error in sum with integers: Result was %d" + " instead of %d\n", sum,known_sum); + } + + diff = (999*1000)/2; + #pragma omp parallel + { + #pragma omp sections private(i) reduction(-:diff) + { + #pragma omp section + { + for (i=1;i<300;i++) { + diff=diff-i; + } + } + #pragma omp section + { + for (i=300;i<700;i++) { + diff=diff-i; + } + } + #pragma omp section + { + for (i=700;i<1000;i++) { + diff=diff-i; + } + } + } + } + if(diff != 0) { + result++; + fprintf(stderr,"Error in Difference with integers: Result was %d" + " instead of 0.\n",diff); + } + + for (i=0;i<20;++i) { + dpt*=dt; + } + dknown_sum = (1-dpt)/(1-dt); + #pragma omp parallel + { + #pragma omp sections private(i) reduction(+:dsum) + { + #pragma omp section + { + for (i=0;i<6;++i) { + dsum += pow(dt,i); + } + } + #pragma omp section + { + for (i=6;i<12;++i) { + dsum += pow(dt,i); + } + } + #pragma omp section + { + for (i=12;i<20;++i) { + dsum += pow(dt,i); + } + } + } + } + if( fabs(dsum-dknown_sum) > rounding_error ) { + result++; + 
fprintf(stderr,"Error in sum with doubles: Result was %f" + " instead of %f (Difference: %E)\n", + dsum, dknown_sum, dsum-dknown_sum); + } + + dpt=1; + for (i=0;i<20;++i) { + dpt*=dt; + } + fprintf(stderr,"\n"); + ddiff = (1-dpt)/(1-dt); + #pragma omp parallel + { + #pragma omp sections private(i) reduction(-:ddiff) + { + #pragma omp section + { + for (i=0;i<6;++i) { + ddiff -= pow(dt,i); + } + } + #pragma omp section + { + for (i=6;i<12;++i) { + ddiff -= pow(dt,i); + } + } + #pragma omp section + { + for (i=12;i<20;++i) { + ddiff -= pow(dt,i); + } + } + } + } + + if(fabs(ddiff) > rounding_error) { + result++; + fprintf(stderr,"Error in Difference with doubles: Result was %E" + " instead of 0.0\n",ddiff); + } + + known_product = 3628800; + #pragma omp parallel + { + #pragma omp sections private(i) reduction(*:product) + { + #pragma omp section + { + for(i=1;i<3;i++) { + product *= i; + } + } + #pragma omp section + { + for(i=3;i<7;i++) { + product *= i; + } + } + #pragma omp section + { + for(i=7;i<11;i++) { + product *= i; + } + } + } + } + if(known_product != product) { + result++; + fprintf(stderr,"Error in Product with integers: Result was %d" + " instead of %d\n",product,known_product); + } + + for(i=0;i<1000;i++) { + logics[i]=1; + } + + #pragma omp parallel + { + #pragma omp sections private(i) reduction(&&:logic_and) + { + #pragma omp section + { + for (i=1;i<300;i++) { + logic_and = (logic_and && logics[i]); + } + } + #pragma omp section + { + for (i=300;i<700;i++) { + logic_and = (logic_and && logics[i]); + } + } + #pragma omp section + { + for (i=700;i<1000;i++) { + logic_and = (logic_and && logics[i]); + } + } + } + } + if(!logic_and) { + result++; + fprintf(stderr,"Error in logic AND part 1\n"); + } + + logic_and = 1; + logics[501] = 0; + + #pragma omp parallel + { + #pragma omp sections private(i) reduction(&&:logic_and) + { + #pragma omp section + { + for (i=1;i<300;i++) { + logic_and = (logic_and && logics[i]); + } + } + #pragma omp section + { + 
for (i=300;i<700;i++) { + logic_and = (logic_and && logics[i]); + } + } + #pragma omp section + { + for (i=700;i<1000;i++) { + logic_and = (logic_and && logics[i]); + } + } + } + } + if(logic_and) { + result++; + fprintf(stderr,"Error in logic AND part 2\n"); + } + + for(i=0;i<1000;i++) { + logics[i]=0; + } + + #pragma omp parallel + { + #pragma omp sections private(i) reduction(||:logic_or) + { + #pragma omp section + { + for (i=1;i<300;i++) { + logic_or = (logic_or || logics[i]); + } + } + #pragma omp section + { + for (i=300;i<700;i++) { + logic_or = (logic_or || logics[i]); + } + } + #pragma omp section + { + for (i=700;i<1000;i++) { + logic_or = (logic_or || logics[i]); + } + } + } + } + if(logic_or) { + result++; + fprintf(stderr,"\nError in logic OR part 1\n"); + } + + logic_or = 0; + logics[501]=1; + + #pragma omp parallel + { + #pragma omp sections private(i) reduction(||:logic_or) + { + #pragma omp section + { + for (i=1;i<300;i++) { + logic_or = (logic_or || logics[i]); + } + } + #pragma omp section + { + for (i=300;i<700;i++) { + logic_or = (logic_or || logics[i]); + } + } + #pragma omp section + { + for (i=700;i<1000;i++) { + logic_or = (logic_or || logics[i]); + } + } + } + } + if(!logic_or) { + result++; + fprintf(stderr,"Error in logic OR part 2\n"); + } + + for(i=0;i<1000;++i) { + logics[i]=1; + } + + #pragma omp parallel + { + #pragma omp sections private(i) reduction(&:bit_and) + { + #pragma omp section + { + for(i=0;i<300;++i) { + bit_and = (bit_and & logics[i]); + } + } + #pragma omp section + { + for(i=300;i<700;++i) { + bit_and = (bit_and & logics[i]); + } + } + #pragma omp section + { + for(i=700;i<1000;++i) { + bit_and = (bit_and & logics[i]); + } + } + } + } + if(!bit_and) { + result++; + fprintf(stderr,"Error in BIT AND part 1\n"); + } + + bit_and = 1; + logics[501]=0; + + #pragma omp parallel + { + #pragma omp sections private(i) reduction(&:bit_and) + { + #pragma omp section + { + for(i=0;i<300;++i) { + bit_and = bit_and & logics[i]; + 
} + } + #pragma omp section + { + for(i=300;i<700;++i) { + bit_and = bit_and & logics[i]; + } + } + #pragma omp section + { + for(i=700;i<1000;++i) { + bit_and = bit_and & logics[i]; + } + } + } + } + if(bit_and) { + result++; + fprintf(stderr,"Error in BIT AND part 2\n"); + } + + for(i=0;i<1000;i++) { + logics[i]=0; + } + + #pragma omp parallel + { + #pragma omp sections private(i) reduction(|:bit_or) + { + #pragma omp section + { + for(i=0;i<300;++i) { + bit_or = bit_or | logics[i]; + } + } + #pragma omp section + { + for(i=300;i<700;++i) { + bit_or = bit_or | logics[i]; + } + } + #pragma omp section + { + for(i=700;i<1000;++i) { + bit_or = bit_or | logics[i]; + } + } + } + } + if(bit_or) { + result++; + fprintf(stderr,"Error in BIT OR part 1\n"); + } + bit_or = 0; + logics[501]=1; + + #pragma omp parallel + { + #pragma omp sections private(i) reduction(|:bit_or) + { + #pragma omp section + { + for(i=0;i<300;++i) { + bit_or = bit_or | logics[i]; + } + } + #pragma omp section + { + for(i=300;i<700;++i) { + bit_or = bit_or | logics[i]; + } + } + #pragma omp section + { + for(i=700;i<1000;++i) { + bit_or = bit_or | logics[i]; + } + } + } + } + if(!bit_or) { + result++; + fprintf(stderr,"Error in BIT OR part 2\n"); + } + + for(i=0;i<1000;i++) { + logics[i]=0; + } + + #pragma omp parallel + { + #pragma omp sections private(i) reduction(^:exclusiv_bit_or) + { + #pragma omp section + { + for(i=0;i<300;++i) { + exclusiv_bit_or = exclusiv_bit_or ^ logics[i]; + } + } + #pragma omp section + { + for(i=300;i<700;++i) { + exclusiv_bit_or = exclusiv_bit_or ^ logics[i]; + } + } + #pragma omp section + { + for(i=700;i<1000;++i) { + exclusiv_bit_or = exclusiv_bit_or ^ logics[i]; + } + } + } + } + if(exclusiv_bit_or) { + result++; + fprintf(stderr,"Error in EXCLUSIV BIT OR part 1\n"); + } + + exclusiv_bit_or = 0; + logics[501]=1; + + #pragma omp parallel + { + #pragma omp sections private(i) reduction(^:exclusiv_bit_or) + { + #pragma omp section + { + for(i=0;i<300;++i) { + 
exclusiv_bit_or = exclusiv_bit_or ^ logics[i]; + } + } + #pragma omp section + { + for(i=300;i<700;++i) { + exclusiv_bit_or = exclusiv_bit_or ^ logics[i]; + } + } + #pragma omp section + { + for(i=700;i<1000;++i) { + exclusiv_bit_or = exclusiv_bit_or ^ logics[i]; + } + } + } + } + if(!exclusiv_bit_or) { + result++; + fprintf(stderr,"Error in EXCLUSIV BIT OR part 2\n"); + } + + /*printf("\nResult:%d\n",result);*/ + return (result==0); +} +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_sections_reduction()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/single/omp_single.c b/final/runtime/test/worksharing/single/omp_single.c new file mode 100644 index 0000000..4963579 --- /dev/null +++ b/final/runtime/test/worksharing/single/omp_single.c @@ -0,0 +1,44 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include "omp_testsuite.h" + +int test_omp_single() +{ + int nr_threads_in_single; + int result; + int nr_iterations; + int i; + + nr_threads_in_single = 0; + result = 0; + nr_iterations = 0; + + #pragma omp parallel private(i) + { + for (i = 0; i < LOOPCOUNT; i++) { + #pragma omp single + { + #pragma omp flush + nr_threads_in_single++; + #pragma omp flush + nr_iterations++; + nr_threads_in_single--; + result = result + nr_threads_in_single; + } + } + } + return ((result == 0) && (nr_iterations == LOOPCOUNT)); +} /* end of check_single*/ + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_single()) { + num_failed++; + } + } + return num_failed; +} diff --git a/final/runtime/test/worksharing/single/omp_single_copyprivate.c b/final/runtime/test/worksharing/single/omp_single_copyprivate.c new file mode 100644 index 0000000..2fece5c --- /dev/null +++ b/final/runtime/test/worksharing/single/omp_single_copyprivate.c @@ -0,0 +1,60 @@ +// RUN: %libomp-compile-and-run +#include "omp_testsuite.h" + +#define DEBUG_TEST 0 + +int j; 
/*
 * Reconstructed from a line-collapsed diff dump.
 * Original sources: final/runtime/test/worksharing/single/
 * omp_single_copyprivate.c, omp_single_nowait.c and omp_single_private.c.
 * Diff metadata and the per-file `main` harnesses from the dump are not
 * reproduced here (they would be duplicate `main` definitions in one unit).
 */
#include <stdio.h>

#ifndef LOOPCOUNT
#define LOOPCOUNT 1000 /* fallback when omp_testsuite.h is unavailable */
#endif
#ifndef DEBUG_TEST
#define DEBUG_TEST 0 /* set to 1 for per-thread tracing */
#endif
#ifdef _OPENMP
#include <omp.h> /* omp_get_thread_num */
#else
/* Serial fallback so this unit also builds without OpenMP support. */
static int omp_get_thread_num(void) { return 0; }
#endif

int j;
#pragma omp threadprivate(j)

/*
 * (omp_single_copyprivate.c) Checks the `copyprivate(j)` clause: the one
 * thread that executes the single region stores the loop index into the
 * threadprivate `j`, and copyprivate must broadcast it to every thread's
 * copy, so j - i is 0 everywhere.  Returns 1 on success, 0 on failure.
 */
int test_omp_single_copyprivate()
{
  int result;
  int nr_iterations;

  result = 0;
  nr_iterations = 0;
  #pragma omp parallel num_threads(4)
  {
    int i;
    for (i = 0; i < LOOPCOUNT; i++)
    {
#if DEBUG_TEST
      int thread;
      thread = omp_get_thread_num ();
#endif
      #pragma omp single copyprivate(j)
      {
        nr_iterations++;
        j = i;
#if DEBUG_TEST
        printf ("thread %d assigns, j = %d, i = %d\n", thread, j, i);
#endif
      }
#if DEBUG_TEST
      #pragma omp barrier
#endif
      #pragma omp critical
      {
#if DEBUG_TEST
        printf ("thread = %d, j = %d, i = %d\n", thread, j, i);
#endif
        result = result + j - i;
      }
      #pragma omp barrier
    } /* end of for */
  } /* end of parallel */
  return ((result == 0) && (nr_iterations == LOOPCOUNT));
}

/*
 * (omp_single_nowait.c) This test will hang if the nowait is not working
 * properly.  It relies on one thread skipping to the last single construct
 * to release the threads waiting in the first three single constructs.
 */
volatile int release;
volatile int count;

/* Spin until `release` becomes non-zero, then bump the shared counter. */
void wait_for_release_then_increment(int rank)
{
  fprintf(stderr, "Thread nr %d enters first section"
    " and waits.\n", rank);
  while (release == 0);
  #pragma omp atomic
  count++;
}

/* Set the release flag and bump the shared counter. */
void release_and_increment(int rank)
{
  fprintf(stderr, "Thread nr %d sets release to 1\n", rank);
  release = 1;
  #pragma omp atomic
  count++;
}

/*
 * Requires multiple threads: a thread that skips the first three
 * `single nowait` regions must reach the final (blocking) single and set
 * `release`, freeing the waiters.  Run serially this spins forever in
 * wait_for_release_then_increment.  Returns 1 iff all four singles ran.
 */
int test_omp_single_nowait()
{
  release = 0;
  count = 0;

  #pragma omp parallel num_threads(4)
  {
    int rank;
    rank = omp_get_thread_num ();
    #pragma omp single nowait
    {
      wait_for_release_then_increment(rank);
    }
    #pragma omp single nowait
    {
      wait_for_release_then_increment(rank);
    }
    #pragma omp single nowait
    {
      wait_for_release_then_increment(rank);
    }

    #pragma omp single
    {
      release_and_increment(rank);
    }
  }
  // Check to make sure all four singles were executed
  return (count==4);
} /* end of check_single_nowait*/

/* (omp_single_private.c) threadprivate counters: per-thread iteration and
 * result accumulators, reduced under the critical section below. */
int myit = 0;
#pragma omp threadprivate(myit)
int myresult = 0;
#pragma omp threadprivate(myresult)

/*
 * Checks `single private(nr_threads_in_single) nowait`: each execution of
 * the single region works on a private copy, so the shared
 * nr_threads_in_single must remain 0 and the per-thread iteration counts
 * must add up to LOOPCOUNT.  Returns 1 on success, 0 on failure.
 */
int test_omp_single_private()
{
  int nr_threads_in_single;
  int result;
  int nr_iterations;
  int i;

  myit = 0;
  nr_threads_in_single = 0;
  nr_iterations = 0;
  result = 0;

  #pragma omp parallel private(i)
  {
    myresult = 0;
    myit = 0;
    for (i = 0; i < LOOPCOUNT; i++) {
      #pragma omp single private(nr_threads_in_single) nowait
      {
        nr_threads_in_single = 0;
        #pragma omp flush
        nr_threads_in_single++;
        #pragma omp flush
        myit++;
        myresult = myresult + nr_threads_in_single;
      }
    }
    #pragma omp critical
    {
      result += nr_threads_in_single;
      nr_iterations += myit;
    }
  }
  return ((result == 0) && (nr_iterations == LOOPCOUNT));
} /* end of check_single private */