Diffstat (limited to 'final/runtime/test/worksharing/for')
-rw-r--r--  final/runtime/test/worksharing/for/bug_set_schedule_0.c  40
-rw-r--r--  final/runtime/test/worksharing/for/kmp_doacross_check.c  62
-rw-r--r--  final/runtime/test/worksharing/for/kmp_sch_simd_guided.c  410
-rw-r--r--  final/runtime/test/worksharing/for/kmp_sch_simd_runtime_api.c  221
-rw-r--r--  final/runtime/test/worksharing/for/kmp_sch_simd_runtime_guided.c  196
-rw-r--r--  final/runtime/test/worksharing/for/kmp_sch_simd_runtime_static.c  201
-rw-r--r--  final/runtime/test/worksharing/for/kmp_set_dispatch_buf.c  91
-rw-r--r--  final/runtime/test/worksharing/for/omp_doacross.c  60
-rw-r--r--  final/runtime/test/worksharing/for/omp_for_bigbounds.c  70
-rw-r--r--  final/runtime/test/worksharing/for/omp_for_collapse.c  51
-rw-r--r--  final/runtime/test/worksharing/for/omp_for_firstprivate.c  55
-rw-r--r--  final/runtime/test/worksharing/for/omp_for_lastprivate.c  52
-rw-r--r--  final/runtime/test/worksharing/for/omp_for_nowait.c  77
-rw-r--r--  final/runtime/test/worksharing/for/omp_for_ordered.c  60
-rw-r--r--  final/runtime/test/worksharing/for/omp_for_private.c  63
-rw-r--r--  final/runtime/test/worksharing/for/omp_for_reduction.c  339
-rw-r--r--  final/runtime/test/worksharing/for/omp_for_schedule_auto.c  69
-rw-r--r--  final/runtime/test/worksharing/for/omp_for_schedule_dynamic.c  89
-rw-r--r--  final/runtime/test/worksharing/for/omp_for_schedule_guided.c  217
-rw-r--r--  final/runtime/test/worksharing/for/omp_for_schedule_runtime.c  82
-rw-r--r--  final/runtime/test/worksharing/for/omp_for_schedule_static.c  154
-rw-r--r--  final/runtime/test/worksharing/for/omp_for_schedule_static_3.c  202
-rw-r--r--  final/runtime/test/worksharing/for/omp_parallel_for_firstprivate.c  35
-rw-r--r--  final/runtime/test/worksharing/for/omp_parallel_for_if.c  42
-rw-r--r--  final/runtime/test/worksharing/for/omp_parallel_for_lastprivate.c  37
-rw-r--r--  final/runtime/test/worksharing/for/omp_parallel_for_ordered.c  64
-rw-r--r--  final/runtime/test/worksharing/for/omp_parallel_for_private.c  50
-rw-r--r--  final/runtime/test/worksharing/for/omp_parallel_for_reduction.c  266
28 files changed, 3355 insertions, 0 deletions
diff --git a/final/runtime/test/worksharing/for/bug_set_schedule_0.c b/final/runtime/test/worksharing/for/bug_set_schedule_0.c
new file mode 100644
index 0000000..889e239
--- /dev/null
+++ b/final/runtime/test/worksharing/for/bug_set_schedule_0.c
@@ -0,0 +1,40 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+#include "omp_testsuite.h"
+
+/* Test that the chunk size is set to default (1) when
+ chunk size <= 0 is specified */
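+/* i.e., after omp_set_schedule(omp_sched_dynamic, 0), the schedule(runtime)
+   loop below should behave as if it were written
+       #pragma omp for schedule(dynamic, 1)
+   (illustrative restatement of the premise above, not an extra requirement) */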
+int a = 0;
+
+int test_set_schedule_0()
+{
+ int i;
+ a = 0;
+ omp_set_schedule(omp_sched_dynamic,0);
+
+ #pragma omp parallel
+ {
+ #pragma omp for schedule(runtime)
+ for(i = 0; i < 10; i++) {
+ #pragma omp atomic
+ a++;
+ if(a > 10)
+ exit(1);
+ }
+ }
+ return a==10;
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_set_schedule_0()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
diff --git a/final/runtime/test/worksharing/for/kmp_doacross_check.c b/final/runtime/test/worksharing/for/kmp_doacross_check.c
new file mode 100644
index 0000000..59b61e3
--- /dev/null
+++ b/final/runtime/test/worksharing/for/kmp_doacross_check.c
@@ -0,0 +1,62 @@
+// RUN: %libomp-compile-and-run
+// UNSUPPORTED: gcc
+// This test is incompatible with gcc because of the explicit call to
+// __kmpc_doacross_fini(). gcc relies on an implicit call to this function
+// when the last iteration is executed inside the GOMP_loop_*_next() functions.
+// Hence, in gcc, having the explicit call leads to __kmpc_doacross_fini()
+// being called twice.
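+//
+// For reference, the runtime-call sequence below corresponds roughly to the
+// following user-level doacross loop (illustrative sketch only; the test
+// drives the __kmpc_doacross_* entry points directly):
+//
+//   #pragma omp for ordered(1) schedule(dynamic)
+//   for (i = 1; i < N; ++i) {
+//     #pragma omp ordered depend(sink: i-1)
+//     iter[i] = iter[i-1] + 1;
+//     #pragma omp ordered depend(source)
+//   }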
+#include <stdio.h>
+
+#define N 1000
+
+struct dim {
+ long long lo; // lower
+ long long up; // upper
+ long long st; // stride
+};
+extern void __kmpc_doacross_init(void*, int, int, struct dim *);
+extern void __kmpc_doacross_wait(void*, int, long long*);
+extern void __kmpc_doacross_post(void*, int, long long*);
+extern void __kmpc_doacross_fini(void*, int);
+extern int __kmpc_global_thread_num(void*);
+
+int main()
+{
+ int i;
+ int iter[N];
+ struct dim dims;
+ for( i = 0; i < N; ++i )
+ iter[i] = 1;
+ dims.lo = 1;
+ dims.up = N-1;
+ dims.st = 1;
+ #pragma omp parallel num_threads(4)
+ {
+ int i, gtid;
+ long long vec;
+ gtid = __kmpc_global_thread_num(NULL);
+ __kmpc_doacross_init(NULL,gtid,1,&dims); // thread starts the loop
+ #pragma omp for nowait schedule(dynamic)
+ for( i = 1; i < N; ++i )
+ {
+ // runtime call corresponding to #pragma omp ordered depend(sink:i-1)
+ vec=i-1;
+ __kmpc_doacross_wait(NULL,gtid,&vec);
+ // user's code
+ iter[i] = iter[i-1] + 1;
+ // runtime call corresponding to #pragma omp ordered depend(source)
+ vec=i;
+ __kmpc_doacross_post(NULL,gtid,&vec);
+ }
+ // thread finishes the loop (should be before the loop barrier)
+ __kmpc_doacross_fini(NULL,gtid);
+ }
+ if( iter[N-1] == N ) {
+ printf("passed\n");
+ } else {
+ printf("failed %d != %d\n", iter[N-1], N);
+ return 1;
+ }
+ return 0;
+}
+
diff --git a/final/runtime/test/worksharing/for/kmp_sch_simd_guided.c b/final/runtime/test/worksharing/for/kmp_sch_simd_guided.c
new file mode 100644
index 0000000..5c6f94b
--- /dev/null
+++ b/final/runtime/test/worksharing/for/kmp_sch_simd_guided.c
@@ -0,0 +1,410 @@
+// RUN: %libomp-compile-and-run
+/*
+ Test for the 'schedule(simd:guided)' clause.
+ The compiler needs to generate dynamic dispatching and pass the schedule
+ value 46 to the OpenMP RTL. The test uses numerous loop parameter combinations.
+*/
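+/*
+  For reference, the construct this test emulates by hand is roughly:
+
+    #pragma omp for simd schedule(simd:guided, chunk)
+    for (i = lb; i <= ub; i += st)
+      ...;
+
+  which the compiler is expected to lower to __kmpc_dispatch_init_4/_8()
+  with schedule value kmp_sch_guided_simd (46), exactly as run_loop_32/64()
+  do below. (Illustrative sketch only, not part of the test.)
+*/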
+#include <stdio.h>
+#include <omp.h>
+
+#if defined(WIN32) || defined(_WIN32)
+#include <windows.h>
+#define delay() Sleep(1);
+#else
+#include <unistd.h>
+#define delay() usleep(10);
+#endif
+
+// uncomment for debug diagnostics:
+//#define DEBUG
+
+#define SIMD_LEN 4
+
+// ---------------------------------------------------------------------------
+// Various definitions copied from OpenMP RTL
+enum sched {
+ kmp_sch_static_balanced_chunked = 45,
+ kmp_sch_guided_simd = 46,
+ kmp_sch_runtime_simd = 47,
+};
+typedef unsigned u32;
+typedef long long i64;
+typedef unsigned long long u64;
+typedef struct {
+ int reserved_1;
+ int flags;
+ int reserved_2;
+ int reserved_3;
+ char *psource;
+} id;
+
+extern int __kmpc_global_thread_num(id*);
+extern void __kmpc_barrier(id*, int gtid);
+extern void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int);
+extern void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64);
+extern int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*);
+extern int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*);
+// End of definitions copied from OpenMP RTL.
+// ---------------------------------------------------------------------------
+static id loc = {0, 2, 0, 0, ";file;func;0;0;;"};
+
+// ---------------------------------------------------------------------------
+int run_loop_64(i64 loop_lb, i64 loop_ub, i64 loop_st, int loop_chunk) {
+ int err = 0;
+ static int volatile loop_sync = 0;
+ i64 lb; // Chunk lower bound
+ i64 ub; // Chunk upper bound
+ i64 st; // Chunk stride
+ int rc;
+ int tid = omp_get_thread_num();
+ int gtid = tid;
+ int last;
+#if DEBUG
+ printf("run_loop_<%d>: gtid %d, tid %d (lb=%d, ub=%d, st=%d, ch=%d)\n",
+ (int)sizeof(i64), gtid, tid,
+ (int)loop_lb, (int)loop_ub, (int)loop_st, loop_chunk);
+#endif
+ // Don't test degenerate cases that should have been discovered by codegen
+ if (loop_st == 0)
+ return 0;
+ if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
+ return 0;
+
+ __kmpc_dispatch_init_8(&loc, gtid, kmp_sch_guided_simd,
+ loop_lb, loop_ub, loop_st, loop_chunk);
+ if (tid == 0) {
+ // Let the master thread handle the chunks alone
+ int chunk; // No of current chunk
+ i64 next_lb; // Lower bound of the next chunk
+ i64 last_ub; // Upper bound of the last processed chunk
+ u64 cur; // Number of iterations in current chunk
+ u64 max; // Max allowed iterations for current chunk
+ int undersized = 0;
+
+ chunk = 0;
+ next_lb = loop_lb;
+ max = (loop_ub - loop_lb) / loop_st + 1;
+ // The first chunk can consume all iterations
+ while (__kmpc_dispatch_next_8(&loc, gtid, &last, &lb, &ub, &st)) {
+ ++ chunk;
+#if DEBUG
+ printf("chunk=%d, lb=%d, ub=%d\n", chunk, (int)lb, (int)ub);
+#endif
+ // If we got another chunk, the previous chunk was not the final one, so it must not have been undersized.
+ if (undersized) {
+ printf("Error with chunk %d\n", chunk);
+ err++;
+ }
+ // Check lower and upper bounds
+ if (lb != next_lb) {
+ printf("Error with lb %d, %d, ch %d\n", (int)lb, (int)next_lb, chunk);
+ err++;
+ }
+ if (loop_st > 0) {
+ if (!(ub <= loop_ub)) {
+ printf("Error with ub %d, %d, ch %d\n", (int)ub, (int)loop_ub, chunk);
+ err++;
+ }
+ if (!(lb <= ub)) {
+ printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk);
+ err++;
+ }
+ } else {
+ if (!(ub >= loop_ub)) {
+ printf("Error with ub %d, %d, %d\n", (int)ub, (int)loop_ub, chunk);
+ err++;
+ }
+ if (!(lb >= ub)) {
+ printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk);
+ err++;
+ }
+ }; // if
+ // Stride should not change
+ if (!(st == loop_st)) {
+ printf("Error with st %d, %d, ch %d\n", (int)st, (int)loop_st, chunk);
+ err++;
+ }
+ cur = (ub - lb) / loop_st + 1;
+ // Guided scheduling uses FP computations, so current chunk may
+ // be a bit bigger (+1) than allowed maximum
+ if (!(cur <= max + 1)) {
+ printf("Error with iter %d, %d\n", cur, max);
+ err++;
+ }
+ // Update maximum for the next chunk
+ if (cur < max)
+ max = cur;
+ next_lb = ub + loop_st;
+ last_ub = ub;
+ undersized = (cur < loop_chunk);
+ }; // while
+ // Must have at least one chunk
+ if (!(chunk > 0)) {
+ printf("Error with chunk %d\n", chunk);
+ err++;
+ }
+ // Must have the right last iteration index
+ if (loop_st > 0) {
+ if (!(last_ub <= loop_ub)) {
+ printf("Error with last1 %d, %d, ch %d\n",
+ (int)last_ub, (int)loop_ub, chunk);
+ err++;
+ }
+ if (!(last_ub + loop_st > loop_ub)) {
+ printf("Error with last2 %d, %d, %d, ch %d\n",
+ (int)last_ub, (int)loop_st, (int)loop_ub, chunk);
+ err++;
+ }
+ } else {
+ if (!(last_ub >= loop_ub)) {
+ printf("Error with last1 %d, %d, ch %d\n",
+ (int)last_ub, (int)loop_ub, chunk);
+ err++;
+ }
+ if (!(last_ub + loop_st < loop_ub)) {
+ printf("Error with last2 %d, %d, %d, ch %d\n",
+ (int)last_ub, (int)loop_st, (int)loop_ub, chunk);
+ err++;
+ }
+ }; // if
+ // Let non-master threads go
+ loop_sync = 1;
+ } else {
+ int i;
+ // Workers wait for master thread to finish, then call __kmpc_dispatch_next
+ for (i = 0; i < 1000000; ++ i) {
+ if (loop_sync != 0) {
+ break;
+ }; // if
+ }; // for i
+ while (loop_sync == 0) {
+ delay();
+ }; // while
+ // At this moment we do not have any more chunks -- all the chunks already
+ // processed by master thread
+ rc = __kmpc_dispatch_next_8(&loc, gtid, &last, &lb, &ub, &st);
+ if (rc) {
+ printf("Error return value\n");
+ err++;
+ }
+ }; // if
+
+ __kmpc_barrier(&loc, gtid);
+ if (tid == 0) {
+ loop_sync = 0; // Restore original state
+#if DEBUG
+ printf("run_loop_64(): at the end\n");
+#endif
+ }; // if
+ __kmpc_barrier(&loc, gtid);
+ return err;
+} // run_loop_64
+
+// ---------------------------------------------------------------------------
+int run_loop_32(int loop_lb, int loop_ub, int loop_st, int loop_chunk) {
+ int err = 0;
+ static int volatile loop_sync = 0;
+ int lb; // Chunk lower bound
+ int ub; // Chunk upper bound
+ int st; // Chunk stride
+ int rc;
+ int tid = omp_get_thread_num();
+ int gtid = tid;
+ int last;
+#if DEBUG
+ printf("run_loop_<%d>: gtid %d, tid %d (lb=%d, ub=%d, st=%d, ch=%d)\n",
+ (int)sizeof(int), gtid, tid,
+ (int)loop_lb, (int)loop_ub, (int)loop_st, loop_chunk);
+#endif
+ // Don't test degenerate cases that should have been discovered by codegen
+ if (loop_st == 0)
+ return 0;
+ if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
+ return 0;
+
+ __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_guided_simd,
+ loop_lb, loop_ub, loop_st, loop_chunk);
+ if (tid == 0) {
+ // Let the master thread handle the chunks alone
+ int chunk; // No of current chunk
+ int next_lb; // Lower bound of the next chunk
+ int last_ub; // Upper bound of the last processed chunk
+ u64 cur; // Number of iterations in current chunk
+ u64 max; // Max allowed iterations for current chunk
+ int undersized = 0;
+
+ chunk = 0;
+ next_lb = loop_lb;
+ max = (loop_ub - loop_lb) / loop_st + 1;
+ // The first chunk can consume all iterations
+ while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) {
+ ++ chunk;
+#if DEBUG
+ printf("chunk=%d, lb=%d, ub=%d\n", chunk, (int)lb, (int)ub);
+#endif
+ // If we got another chunk, the previous chunk was not the final one, so it must not have been undersized.
+ if (undersized) {
+ printf("Error with chunk %d\n", chunk);
+ err++;
+ }
+ // Check lower and upper bounds
+ if (lb != next_lb) {
+ printf("Error with lb %d, %d, ch %d\n", (int)lb, (int)next_lb, chunk);
+ err++;
+ }
+ if (loop_st > 0) {
+ if (!(ub <= loop_ub)) {
+ printf("Error with ub %d, %d, ch %d\n", (int)ub, (int)loop_ub, chunk);
+ err++;
+ }
+ if (!(lb <= ub)) {
+ printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk);
+ err++;
+ }
+ } else {
+ if (!(ub >= loop_ub)) {
+ printf("Error with ub %d, %d, %d\n", (int)ub, (int)loop_ub, chunk);
+ err++;
+ }
+ if (!(lb >= ub)) {
+ printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk);
+ err++;
+ }
+ }; // if
+ // Stride should not change
+ if (!(st == loop_st)) {
+ printf("Error with st %d, %d, ch %d\n", (int)st, (int)loop_st, chunk);
+ err++;
+ }
+ cur = (ub - lb) / loop_st + 1;
+ // Guided scheduling uses FP computations, so current chunk may
+ // be a bit bigger (+1) than allowed maximum
+ if (!(cur <= max + 1)) {
+ printf("Error with iter %d, %d\n", cur, max);
+ err++;
+ }
+ // Update maximum for the next chunk
+ if (cur < max)
+ max = cur;
+ next_lb = ub + loop_st;
+ last_ub = ub;
+ undersized = (cur < loop_chunk);
+ }; // while
+ // Must have at least one chunk
+ if (!(chunk > 0)) {
+ printf("Error with chunk %d\n", chunk);
+ err++;
+ }
+ // Must have the right last iteration index
+ if (loop_st > 0) {
+ if (!(last_ub <= loop_ub)) {
+ printf("Error with last1 %d, %d, ch %d\n",
+ (int)last_ub, (int)loop_ub, chunk);
+ err++;
+ }
+ if (!(last_ub + loop_st > loop_ub)) {
+ printf("Error with last2 %d, %d, %d, ch %d\n",
+ (int)last_ub, (int)loop_st, (int)loop_ub, chunk);
+ err++;
+ }
+ } else {
+ if (!(last_ub >= loop_ub)) {
+ printf("Error with last1 %d, %d, ch %d\n",
+ (int)last_ub, (int)loop_ub, chunk);
+ err++;
+ }
+ if (!(last_ub + loop_st < loop_ub)) {
+ printf("Error with last2 %d, %d, %d, ch %d\n",
+ (int)last_ub, (int)loop_st, (int)loop_ub, chunk);
+ err++;
+ }
+ }; // if
+ // Let non-master threads go
+ loop_sync = 1;
+ } else {
+ int i;
+ // Workers wait for master thread to finish, then call __kmpc_dispatch_next
+ for (i = 0; i < 1000000; ++ i) {
+ if (loop_sync != 0) {
+ break;
+ }; // if
+ }; // for i
+ while (loop_sync == 0) {
+ delay();
+ }; // while
+ // At this moment we do not have any more chunks -- all the chunks already
+ // processed by the master thread
+ rc = __kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st);
+ if (rc) {
+ printf("Error return value\n");
+ err++;
+ }
+ }; // if
+
+ __kmpc_barrier(&loc, gtid);
+ if (tid == 0) {
+ loop_sync = 0; // Restore original state
+#if DEBUG
+ printf("run_loop<>(): at the end\n");
+#endif
+ }; // if
+ __kmpc_barrier(&loc, gtid);
+ return err;
+} // run_loop_32
+
+// ---------------------------------------------------------------------------
+int run_64(int num_th)
+{
+ int err = 0;
+#pragma omp parallel num_threads(num_th)
+ {
+ int chunk;
+ i64 st, lb, ub;
+ for (chunk = SIMD_LEN; chunk <= 3*SIMD_LEN; chunk += SIMD_LEN) {
+ for (st = 1; st <= 3; ++ st) {
+ for (lb = -3 * num_th * st; lb <= 3 * num_th * st; ++ lb) {
+ for (ub = lb; ub < lb + num_th * (chunk+1) * st; ++ ub) {
+ err += run_loop_64(lb, ub, st, chunk);
+ err += run_loop_64(ub, lb, -st, chunk);
+ }; // for ub
+ }; // for lb
+ }; // for st
+ }; // for chunk
+ }
+ return err;
+} // run_64
+
+int run_32(int num_th)
+{
+ int err = 0;
+#pragma omp parallel num_threads(num_th)
+ {
+ int chunk, st, lb, ub;
+ for (chunk = SIMD_LEN; chunk <= 3*SIMD_LEN; chunk += SIMD_LEN) {
+ for (st = 1; st <= 3; ++ st) {
+ for (lb = -3 * num_th * st; lb <= 3 * num_th * st; ++ lb) {
+ for (ub = lb; ub < lb + num_th * (chunk+1) * st; ++ ub) {
+ err += run_loop_32(lb, ub, st, chunk);
+ err += run_loop_32(ub, lb, -st, chunk);
+ }; // for ub
+ }; // for lb
+ }; // for st
+ }; // for chunk
+ }
+ return err;
+} // run_32
+
+// ---------------------------------------------------------------------------
+int main()
+{
+ int n, err = 0;
+ for (n = 1; n <= 4; ++ n) {
+ err += run_32(n);
+ err += run_64(n);
+ }; // for n
+ if (err)
+ printf("failed with %d errors\n", err);
+ else
+ printf("passed\n");
+ return err;
+}
diff --git a/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_api.c b/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_api.c
new file mode 100644
index 0000000..bb538d1
--- /dev/null
+++ b/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_api.c
@@ -0,0 +1,221 @@
+// RUN: %libomp-compile-and-run
+
+// The test checks schedule(simd:runtime)
+// in combination with omp_set_schedule()
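+//
+// For reference, the user-level form being modeled is roughly:
+//
+//   #pragma omp for simd schedule(simd:runtime)
+//   for (i = lb; i <= ub; i += st)
+//     ...;
+//
+// lowered to __kmpc_dispatch_init_4() with schedule value kmp_sch_runtime_simd
+// (47) and the simd length passed as the chunk argument, as run_loop() does
+// below. (Illustrative sketch only.)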
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+#if defined(WIN32) || defined(_WIN32)
+#include <windows.h>
+#define delay() Sleep(1);
+#define seten(a,b,c) _putenv_s((a),(b))
+#else
+#include <unistd.h>
+#define delay() usleep(10);
+#define seten(a,b,c) setenv((a),(b),(c))
+#endif
+
+#define SIMD_LEN 4
+int err = 0;
+
+// ---------------------------------------------------------------------------
+// Various definitions copied from OpenMP RTL.
+enum sched {
+ kmp_sch_static_balanced_chunked = 45,
+ kmp_sch_guided_simd = 46,
+ kmp_sch_runtime_simd = 47,
+};
+typedef unsigned u32;
+typedef long long i64;
+typedef unsigned long long u64;
+typedef struct {
+ int reserved_1;
+ int flags;
+ int reserved_2;
+ int reserved_3;
+ char *psource;
+} id;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ int __kmpc_global_thread_num(id*);
+ void __kmpc_barrier(id*, int gtid);
+ void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int);
+ void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64);
+ int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*);
+ int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+// End of definitions copied from OpenMP RTL.
+// ---------------------------------------------------------------------------
+static id loc = {0, 2, 0, 0, ";file;func;0;0;;"};
+
+// ---------------------------------------------------------------------------
+void
+run_loop(
+ int loop_lb, // Loop lower bound.
+ int loop_ub, // Loop upper bound.
+ int loop_st, // Loop stride.
+ int lchunk
+) {
+ static int volatile loop_sync = 0;
+ int lb; // Chunk lower bound.
+ int ub; // Chunk upper bound.
+ int st; // Chunk stride.
+ int rc;
+ int tid = omp_get_thread_num();
+ int gtid = __kmpc_global_thread_num(&loc);
+ int last;
+ int tc = (loop_ub - loop_lb) / loop_st + 1;
+ int ch;
+ int no_chunk = 0;
+ if (lchunk == 0) {
+ no_chunk = 1;
+ lchunk = 1;
+ }
+ ch = lchunk * SIMD_LEN;
+#if _DEBUG > 1
+ printf("run_loop gtid %d tid %d (lb=%d, ub=%d, st=%d, ch=%d)\n",
+ gtid, tid, (int)loop_lb, (int)loop_ub, (int)loop_st, lchunk);
+#endif
+ // Don't test degenerate cases that should have been discovered by codegen.
+ if (loop_st == 0)
+ return;
+ if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
+ return;
+ __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_runtime_simd,
+ loop_lb, loop_ub, loop_st, SIMD_LEN);
+ {
+ // Each thread processes the chunks it gets from the dispatcher.
+ int chunk; // No of current chunk.
+ int last_ub; // Upper bound of the last processed chunk.
+ u64 cur; // Number of iterations in current chunk.
+ u64 max; // Max allowed iterations for current chunk.
+ int undersized = 0;
+ last_ub = loop_ub;
+ chunk = 0;
+ max = (loop_ub - loop_lb) / loop_st + 1;
+ // The first chunk can consume all iterations.
+ while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) {
+ ++ chunk;
+#if _DEBUG
+ printf("th %d: chunk=%d, lb=%d, ub=%d ch %d\n",
+ tid, chunk, (int)lb, (int)ub, (int)(ub-lb+1));
+#endif
+ // If we got another chunk, the previous chunk was not the final one, so it must not have been undersized.
+ if (undersized)
+ printf("Error with chunk %d, th %d, err %d\n", chunk, tid, ++err);
+ if (loop_st > 0) {
+ if (!(ub <= loop_ub))
+ printf("Error with ub %d, %d, ch %d, err %d\n",
+ (int)ub, (int)loop_ub, chunk, ++err);
+ if (!(lb <= ub))
+ printf("Error with bounds %d, %d, %d, err %d\n",
+ (int)lb, (int)ub, chunk, ++err);
+ } else {
+ if (!(ub >= loop_ub))
+ printf("Error with ub %d, %d, %d, err %d\n",
+ (int)ub, (int)loop_ub, chunk, ++err);
+ if (!(lb >= ub))
+ printf("Error with bounds %d, %d, %d, err %d\n",
+ (int)lb, (int)ub, chunk, ++err);
+ }; // if
+ // Stride should not change.
+ if (!(st == loop_st))
+ printf("Error with st %d, %d, ch %d, err %d\n",
+ (int)st, (int)loop_st, chunk, ++err);
+ cur = ( ub - lb ) / loop_st + 1;
+ // Guided scheduling uses FP computations, so current chunk may
+ // be a bit bigger (+1) than allowed maximum.
+ if (!( cur <= max + 1))
+ printf("Error with iter %d, %d, err %d\n", cur, max, ++err);
+ // Update maximum for the next chunk.
+ if (last) {
+ if (!no_chunk && cur > ch)
+ printf("Error: too big last chunk %d (%d), tid %d, err %d\n",
+ (int)cur, ch, tid, ++err);
+ } else {
+ if (cur % ch)
+ printf("Error with chunk %d, %d, ch %d, tid %d, err %d\n",
+ chunk, (int)cur, ch, tid, ++err);
+ }
+ if (cur < max)
+ max = cur;
+ last_ub = ub;
+ undersized = (cur < ch);
+#if _DEBUG > 1
+ if (last)
+ printf("under%d cur %d, ch %d, tid %d, ub %d, lb %d, st %d =======\n",
+ undersized,cur,ch,tid,ub,lb,loop_st);
+#endif
+ } // while
+ // Must have the right last iteration index.
+ if (loop_st > 0) {
+ if (!(last_ub <= loop_ub))
+ printf("Error with last1 %d, %d, ch %d, err %d\n",
+ (int)last_ub, (int)loop_ub, chunk, ++err);
+ if (last && !(last_ub + loop_st > loop_ub))
+ printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
+ (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
+ } else {
+ if (!(last_ub >= loop_ub))
+ printf("Error with last1 %d, %d, ch %d, err %d\n",
+ (int)last_ub, (int)loop_ub, chunk, ++err);
+ if (last && !(last_ub + loop_st < loop_ub))
+ printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
+ (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
+ } // if
+ }
+ __kmpc_barrier(&loc, gtid);
+} // run_loop
+
+int main(int argc, char *argv[])
+{
+ int chunk = 0;
+// static (no chunk)
+ omp_set_schedule(omp_sched_static,0);
+#pragma omp parallel// num_threads(num_th)
+ run_loop(0, 26, 1, chunk);
+
+// auto (chunk should be ignored)
+ omp_set_schedule(omp_sched_auto,0);
+#pragma omp parallel// num_threads(num_th)
+ run_loop(0, 26, 1, chunk);
+
+// static,1
+ chunk = 1;
+ omp_set_schedule(omp_sched_static,1);
+#pragma omp parallel// num_threads(num_th)
+ run_loop(0, 26, 1, chunk);
+
+// dynamic,1
+ omp_set_schedule(omp_sched_dynamic,1);
+#pragma omp parallel// num_threads(num_th)
+ run_loop(0, 26, 1, chunk);
+
+// guided,1
+ omp_set_schedule(omp_sched_guided,1);
+#pragma omp parallel// num_threads(num_th)
+ run_loop(0, 26, 1, chunk);
+
+// dynamic,0 - use default chunk size 1
+ omp_set_schedule(omp_sched_dynamic,0);
+#pragma omp parallel// num_threads(num_th)
+ run_loop(0, 26, 1, chunk);
+
+// guided,0 - use default chunk size 1
+ omp_set_schedule(omp_sched_guided,0);
+#pragma omp parallel// num_threads(num_th)
+ run_loop(0, 26, 1, chunk);
+
+ if (err) {
+ printf("failed, err = %d\n", err);
+ return 1;
+ } else {
+ printf("passed\n");
+ return 0;
+ }
+}
diff --git a/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_guided.c b/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_guided.c
new file mode 100644
index 0000000..d137831
--- /dev/null
+++ b/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_guided.c
@@ -0,0 +1,196 @@
+// RUN: %libomp-compile
+// RUN: env OMP_SCHEDULE=guided %libomp-run
+// RUN: env OMP_SCHEDULE=guided,1 %libomp-run 1
+// RUN: env OMP_SCHEDULE=guided,2 %libomp-run 2
+// RUN: env OMP_SCHEDULE=dynamic %libomp-run
+// RUN: env OMP_SCHEDULE=dynamic,1 %libomp-run 1
+// RUN: env OMP_SCHEDULE=dynamic,2 %libomp-run 2
+// RUN: env OMP_SCHEDULE=auto %libomp-run
+
+// The test checks schedule(simd:runtime)
+// in combination with OMP_SCHEDULE=guided[,chunk]
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+#if defined(WIN32) || defined(_WIN32)
+#include <windows.h>
+#define delay() Sleep(1);
+#define seten(a,b,c) _putenv_s((a),(b))
+#else
+#include <unistd.h>
+#define delay() usleep(10);
+#define seten(a,b,c) setenv((a),(b),(c))
+#endif
+
+#define UBOUND 100
+#define SIMD_LEN 4
+int err = 0;
+
+// ---------------------------------------------------------------------------
+// Various definitions copied from OpenMP RTL.
+enum sched {
+ kmp_sch_static_balanced_chunked = 45,
+ kmp_sch_guided_simd = 46,
+ kmp_sch_runtime_simd = 47,
+};
+typedef unsigned u32;
+typedef long long i64;
+typedef unsigned long long u64;
+typedef struct {
+ int reserved_1;
+ int flags;
+ int reserved_2;
+ int reserved_3;
+ char *psource;
+} id;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ int __kmpc_global_thread_num(id*);
+ void __kmpc_barrier(id*, int gtid);
+ void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int);
+ void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64);
+ int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*);
+ int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+// End of definitions copied from OpenMP RTL.
+// ---------------------------------------------------------------------------
+static id loc = {0, 2, 0, 0, ";file;func;0;0;;"};
+
+// ---------------------------------------------------------------------------
+void
+run_loop(
+ int loop_lb, // Loop lower bound.
+ int loop_ub, // Loop upper bound.
+ int loop_st, // Loop stride.
+ int lchunk
+) {
+ static int volatile loop_sync = 0;
+ int lb; // Chunk lower bound.
+ int ub; // Chunk upper bound.
+ int st; // Chunk stride.
+ int rc;
+ int tid = omp_get_thread_num();
+ int gtid = __kmpc_global_thread_num(&loc);
+ int last;
+ int tc = (loop_ub - loop_lb) / loop_st + 1;
+ int ch;
+ int no_chunk = 0;
+ if (lchunk == 0) {
+ no_chunk = 1;
+ lchunk = 1;
+ }
+ ch = lchunk * SIMD_LEN;
+#if _DEBUG > 1
+ printf("run_loop gtid %d tid %d (lb=%d, ub=%d, st=%d, ch=%d)\n",
+ gtid, tid, (int)loop_lb, (int)loop_ub, (int)loop_st, lchunk);
+#endif
+ // Don't test degenerate cases that should have been discovered by codegen.
+ if (loop_st == 0)
+ return;
+ if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
+ return;
+ __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_runtime_simd,
+ loop_lb, loop_ub, loop_st, SIMD_LEN);
+ {
+ // Each thread processes the chunks it gets from the dispatcher.
+ int chunk; // No of current chunk.
+ int last_ub; // Upper bound of the last processed chunk.
+ u64 cur; // Number of iterations in current chunk.
+ u64 max; // Max allowed iterations for current chunk.
+ int undersized = 0;
+ last_ub = loop_ub;
+ chunk = 0;
+ max = (loop_ub - loop_lb) / loop_st + 1;
+ // The first chunk can consume all iterations.
+ while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) {
+ ++ chunk;
+#if _DEBUG
+ printf("th %d: chunk=%d, lb=%d, ub=%d ch %d\n",
+ tid, chunk, (int)lb, (int)ub, (int)(ub-lb+1));
+#endif
+ // If we got another chunk, the previous chunk was not the final one, so it must not have been undersized.
+ if (undersized)
+ printf("Error with chunk %d, th %d, err %d\n", chunk, tid, ++err);
+ if (loop_st > 0) {
+ if (!(ub <= loop_ub))
+ printf("Error with ub %d, %d, ch %d, err %d\n",
+ (int)ub, (int)loop_ub, chunk, ++err);
+ if (!(lb <= ub))
+ printf("Error with bounds %d, %d, %d, err %d\n",
+ (int)lb, (int)ub, chunk, ++err);
+ } else {
+ if (!(ub >= loop_ub))
+ printf("Error with ub %d, %d, %d, err %d\n",
+ (int)ub, (int)loop_ub, chunk, ++err);
+ if (!(lb >= ub))
+ printf("Error with bounds %d, %d, %d, err %d\n",
+ (int)lb, (int)ub, chunk, ++err);
+ }; // if
+ // Stride should not change.
+ if (!(st == loop_st))
+ printf("Error with st %d, %d, ch %d, err %d\n",
+ (int)st, (int)loop_st, chunk, ++err);
+ cur = ( ub - lb ) / loop_st + 1;
+ // Guided scheduling uses FP computations, so current chunk may
+ // be a bit bigger (+1) than allowed maximum.
+ if (!( cur <= max + 1))
+ printf("Error with iter %d, %d, err %d\n", cur, max, ++err);
+ // Update maximum for the next chunk.
+ if (!last && cur % ch)
+ printf("Error with chunk %d, %d, ch %d, tid %d, err %d\n",
+ chunk, (int)cur, ch, tid, ++err);
+ if (last && !no_chunk && cur > ch)
+ printf("Error: too big last chunk %d (%d), tid %d, err %d\n",
+ (int)cur, ch, tid, ++err);
+ if (cur < max)
+ max = cur;
+ last_ub = ub;
+ undersized = (cur < ch);
+#if _DEBUG > 1
+ if (last)
+ printf("under%d cur %d, ch %d, tid %d, ub %d, lb %d, st %d =======\n",
+ undersized,cur,ch,tid,ub,lb,loop_st);
+#endif
+ } // while
+ // Must have the right last iteration index.
+ if (loop_st > 0) {
+ if (!(last_ub <= loop_ub))
+ printf("Error with last1 %d, %d, ch %d, err %d\n",
+ (int)last_ub, (int)loop_ub, chunk, ++err);
+ if (last && !(last_ub + loop_st > loop_ub))
+ printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
+ (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
+ } else {
+ if (!(last_ub >= loop_ub))
+ printf("Error with last1 %d, %d, ch %d, err %d\n",
+ (int)last_ub, (int)loop_ub, chunk, ++err);
+ if (last && !(last_ub + loop_st < loop_ub))
+ printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
+ (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
+ } // if
+ }
+ __kmpc_barrier(&loc, gtid);
+} // run_loop
+
+int main(int argc, char *argv[])
+{
+ int chunk = 0;
+ if (argc > 1) {
+ // expect chunk size as a parameter
+ chunk = atoi(argv[1]);
+ }
+#pragma omp parallel //num_threads(num_th)
+ run_loop(0, UBOUND, 1, chunk);
+ if (err) {
+ printf("failed, err = %d\n", err);
+ return 1;
+ } else {
+ printf("passed\n");
+ return 0;
+ }
+}
diff --git a/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_static.c b/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_static.c
new file mode 100644
index 0000000..4cb15d6
--- /dev/null
+++ b/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_static.c
@@ -0,0 +1,201 @@
+// RUN: %libomp-compile && %libomp-run
+// RUN: %libomp-run 1 && %libomp-run 2
+
+// The test checks schedule(simd:runtime)
+// in combination with OMP_SCHEDULE=static[,chunk]
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <omp.h>
+
+#if defined(WIN32) || defined(_WIN32)
+#include <windows.h>
+#define delay() Sleep(1);
+#define seten(a,b,c) _putenv_s((a),(b))
+#else
+#include <unistd.h>
+#define delay() usleep(10);
+#define seten(a,b,c) setenv((a),(b),(c))
+#endif
+
+#define SIMD_LEN 4
+int err = 0;
+
+// ---------------------------------------------------------------------------
+// Various definitions copied from OpenMP RTL.
+enum sched {
+ kmp_sch_static_balanced_chunked = 45,
+ kmp_sch_guided_simd = 46,
+ kmp_sch_runtime_simd = 47,
+};
+typedef unsigned u32;
+typedef long long i64;
+typedef unsigned long long u64;
+typedef struct {
+ int reserved_1;
+ int flags;
+ int reserved_2;
+ int reserved_3;
+ char *psource;
+} id;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ int __kmpc_global_thread_num(id*);
+ void __kmpc_barrier(id*, int gtid);
+ void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int);
+ void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64);
+ int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*);
+ int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+// End of definitions copied from OpenMP RTL.
+// ---------------------------------------------------------------------------
+static id loc = {0, 2, 0, 0, ";file;func;0;0;;"};
+
+// ---------------------------------------------------------------------------
+void
+run_loop(
+ int loop_lb, // Loop lower bound.
+ int loop_ub, // Loop upper bound.
+ int loop_st, // Loop stride.
+ int lchunk
+) {
+ static int volatile loop_sync = 0;
+ int lb; // Chunk lower bound.
+ int ub; // Chunk upper bound.
+ int st; // Chunk stride.
+ int rc;
+ int tid = omp_get_thread_num();
+ int gtid = __kmpc_global_thread_num(&loc);
+ int last;
+ int tc = (loop_ub - loop_lb) / loop_st + 1;
+ int ch;
+ int no_chunk = 0;
+ if (lchunk == 0) {
+ no_chunk = 1;
+ lchunk = 1;
+ }
+ ch = lchunk * SIMD_LEN;
+#if _DEBUG > 1
+ printf("run_loop gtid %d tid %d (lb=%d, ub=%d, st=%d, ch=%d)\n",
+ gtid, tid, (int)loop_lb, (int)loop_ub, (int)loop_st, lchunk);
+#endif
+ // Don't test degenerate cases that should have been discovered by codegen.
+ if (loop_st == 0)
+ return;
+ if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
+ return;
+ __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_runtime_simd,
+ loop_lb, loop_ub, loop_st, SIMD_LEN);
+ {
+ // Each thread processes the chunks it gets from the dispatcher.
+ int chunk; // No of current chunk.
+ int last_ub; // Upper bound of the last processed chunk.
+ u64 cur; // Number of iterations in current chunk.
+ u64 max; // Max allowed iterations for current chunk.
+ int undersized = 0;
+ last_ub = loop_ub;
+ chunk = 0;
+ max = (loop_ub - loop_lb) / loop_st + 1;
+ // The first chunk can consume all iterations.
+ while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) {
+ ++ chunk;
+#if _DEBUG
+ printf("th %d: chunk=%d, lb=%d, ub=%d ch %d\n",
+ tid, chunk, (int)lb, (int)ub, (int)(ub-lb+1));
+#endif
+ // If we got another chunk, the previous chunk was not the final one, so it must not have been undersized.
+ if (undersized)
+ printf("Error with chunk %d, th %d, err %d\n", chunk, tid, ++err);
+ if (loop_st > 0) {
+ if (!(ub <= loop_ub))
+ printf("Error with ub %d, %d, ch %d, err %d\n",
+ (int)ub, (int)loop_ub, chunk, ++err);
+ if (!(lb <= ub))
+ printf("Error with bounds %d, %d, %d, err %d\n",
+ (int)lb, (int)ub, chunk, ++err);
+ } else {
+ if (!(ub >= loop_ub))
+ printf("Error with ub %d, %d, %d, err %d\n",
+ (int)ub, (int)loop_ub, chunk, ++err);
+ if (!(lb >= ub))
+ printf("Error with bounds %d, %d, %d, err %d\n",
+ (int)lb, (int)ub, chunk, ++err);
+ }; // if
+ // Stride should not change.
+ if (!(st == loop_st))
+ printf("Error with st %d, %d, ch %d, err %d\n",
+ (int)st, (int)loop_st, chunk, ++err);
+ cur = ( ub - lb ) / loop_st + 1;
+ // Guided scheduling uses FP computations, so current chunk may
+ // be a bit bigger (+1) than allowed maximum.
+ if (!( cur <= max + 1))
+ printf("Error with iter %d, %d, err %d\n", cur, max, ++err);
+ // Update maximum for the next chunk.
+ if (last) {
+ if (!no_chunk && cur > ch)
+ printf("Error: too big last chunk %d (%d), tid %d, err %d\n",
+ (int)cur, ch, tid, ++err);
+ } else {
+ if (cur % ch)
+ printf("Error with chunk %d, %d, ch %d, tid %d, err %d\n",
+ chunk, (int)cur, ch, tid, ++err);
+ }
+ if (cur < max)
+ max = cur;
+ last_ub = ub;
+ undersized = (cur < ch);
+#if _DEBUG > 1
+ if (last)
+ printf("under%d cur %d, ch %d, tid %d, ub %d, lb %d, st %d =======\n",
+ undersized,cur,ch,tid,ub,lb,loop_st);
+#endif
+ } // while
+ // Must have the right last iteration index.
+ if (loop_st > 0) {
+ if (!(last_ub <= loop_ub))
+ printf("Error with last1 %d, %d, ch %d, err %d\n",
+ (int)last_ub, (int)loop_ub, chunk, ++err);
+ if (last && !(last_ub + loop_st > loop_ub))
+ printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
+ (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
+ } else {
+ if (!(last_ub >= loop_ub))
+ printf("Error with last1 %d, %d, ch %d, err %d\n",
+ (int)last_ub, (int)loop_ub, chunk, ++err);
+ if (last && !(last_ub + loop_st < loop_ub))
+ printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
+ (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
+ } // if
+ }
+ __kmpc_barrier(&loc, gtid);
+} // run_loop
+
+int main(int argc, char *argv[])
+{
+ int chunk = 0;
+ if (argc > 1) {
+ char *buf = malloc(8 + strlen(argv[1]));
+ // expect chunk size as a parameter
+ chunk = atoi(argv[1]);
+ strcpy(buf,"static,");
+ strcat(buf,argv[1]);
+ seten("OMP_SCHEDULE",buf,1);
+ printf("Testing schedule(simd:%s)\n", buf);
+ free(buf);
+ } else {
+ seten("OMP_SCHEDULE","static",1);
+ printf("Testing schedule(simd:static)\n");
+ }
+#pragma omp parallel// num_threads(num_th)
+ run_loop(0, 26, 1, chunk);
+ if (err) {
+ printf("failed, err = %d\n", err);
+ return 1;
+ } else {
+ printf("passed\n");
+ return 0;
+ }
+}
diff --git a/final/runtime/test/worksharing/for/kmp_set_dispatch_buf.c b/final/runtime/test/worksharing/for/kmp_set_dispatch_buf.c
new file mode 100644
index 0000000..a6378fe
--- /dev/null
+++ b/final/runtime/test/worksharing/for/kmp_set_dispatch_buf.c
@@ -0,0 +1,91 @@
+// RUN: %libomp-compile && %libomp-run 7
+// RUN: %libomp-run 0 && %libomp-run -1
+// RUN: %libomp-run 1 && %libomp-run 2 && %libomp-run 5
+// RUN: %libomp-compile -DMY_SCHEDULE=guided && %libomp-run 7
+// RUN: %libomp-run 1 && %libomp-run 2 && %libomp-run 5
+#include <stdio.h>
+#include <omp.h>
+#include <stdlib.h>
+#include <limits.h>
+#include "omp_testsuite.h"
+
+#define INCR 7
+#define MY_MAX 200
+#define MY_MIN -200
+#ifndef MY_SCHEDULE
+# define MY_SCHEDULE dynamic
+#endif
+
+int num_disp_buffers, num_loops;
+int a, b, a_known_value, b_known_value;
+
+int test_kmp_set_disp_num_buffers()
+{
+ int success = 1;
+ a = 0;
+ b = 0;
+ // run many small dynamic loops to stress the dispatch buffer system
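+ // (With nowait, threads may be working in different loop instances at the
+ // same time, which is what exercises having several dispatch buffers in use.)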
+ #pragma omp parallel
+ {
+ int i,j;
+ for (j = 0; j < num_loops; j++) {
+ #pragma omp for schedule(MY_SCHEDULE) nowait
+ for (i = MY_MIN; i < MY_MAX; i+=INCR) {
+ #pragma omp atomic
+ a++;
+ }
+ #pragma omp for schedule(MY_SCHEDULE) nowait
+ for (i = MY_MAX; i >= MY_MIN; i-=INCR) {
+ #pragma omp atomic
+ b++;
+ }
+ }
+ }
+ // detect failure
+ if (a != a_known_value || b != b_known_value) {
+ success = 0;
+ printf("a = %d (should be %d), b = %d (should be %d)\n", a, a_known_value,
+ b, b_known_value);
+ }
+ return success;
+}
+
+int main(int argc, char** argv)
+{
+ int i,j;
+ int num_failed=0;
+
+ if (argc != 2) {
+ fprintf(stderr, "usage: %s num_disp_buffers\n", argv[0]);
+ exit(1);
+ }
+
+ // set the number of dispatch buffers
+ num_disp_buffers = atoi(argv[1]);
+ kmp_set_disp_num_buffers(num_disp_buffers);
+
+ // figure out the known values to compare with calculated result
+ a_known_value = 0;
+ b_known_value = 0;
+
+ // if specified to use bad num_disp_buffers set num_loops
+ // to something reasonable
+ if (num_disp_buffers <= 0)
+ num_loops = 10;
+ else
+ num_loops = num_disp_buffers*10;
+
+ for (j = 0; j < num_loops; j++) {
+ for (i = MY_MIN; i < MY_MAX; i+=INCR)
+ a_known_value++;
+ for (i = MY_MAX; i >= MY_MIN; i-=INCR)
+ b_known_value++;
+ }
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_kmp_set_disp_num_buffers()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
diff --git a/final/runtime/test/worksharing/for/omp_doacross.c b/final/runtime/test/worksharing/for/omp_doacross.c
new file mode 100644
index 0000000..4187112
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_doacross.c
@@ -0,0 +1,60 @@
+// RUN: %libomp-compile-and-run
+// XFAIL: gcc-4, gcc-5, clang-3.7, clang-3.8, icc-15, icc-16
+#include <stdio.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+
+#ifndef N
+#define N 750
+#endif
+
+int test_doacross() {
+ int i, j;
+ // Allocate and zero out the matrix
+ int *m = (int *)malloc(sizeof(int) * N * N);
+ for (i = 0; i < N; ++i) {
+ for (j = 0; j < N; ++j) {
+ m[i * N + j] = 0;
+ }
+ }
+ // Have first row and column be 0, 1, 2, 3, etc.
+ for (i = 0; i < N; ++i)
+ m[i * N] = i;
+ for (j = 0; j < N; ++j)
+ m[j] = j;
+ // Perform wavefront which results in matrix:
+ // 0 1 2 3 4
+ // 1 2 3 4 5
+ // 2 3 4 5 6
+ // 3 4 5 6 7
+ // 4 5 6 7 8
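+ // Each element ends up as m[row][col] = row + col, so the bottom-right
+ // element checked at the end should equal 2 * (N - 1).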
+ #pragma omp parallel shared(m)
+ {
+ int row, col;
+ #pragma omp for ordered(2)
+ for (row = 1; row < N; ++row) {
+ for (col = 1; col < N; ++col) {
+ #pragma omp ordered depend(sink : row - 1, col) depend(sink : row, col - 1)
+ m[row * N + col] = m[(row - 1) * N + col] + m[row * N + (col - 1)] -
+ m[(row - 1) * N + (col - 1)];
+ #pragma omp ordered depend(source)
+ }
+ }
+ }
+
+ // Check the bottom right element to see if iteration dependencies were held
+ int retval = (m[(N - 1) * N + N - 1] == 2 * (N - 1));
+ free(m);
+ return retval;
+}
+
+int main(int argc, char **argv) {
+ int i;
+ int num_failed = 0;
+ for (i = 0; i < REPETITIONS; i++) {
+ if (!test_doacross()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
diff --git a/final/runtime/test/worksharing/for/omp_for_bigbounds.c b/final/runtime/test/worksharing/for/omp_for_bigbounds.c
new file mode 100644
index 0000000..901d760
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_bigbounds.c
@@ -0,0 +1,70 @@
+// RUN: %libomp-compile -DMY_SCHEDULE=static && %libomp-run
+// RUN: %libomp-compile -DMY_SCHEDULE=dynamic && %libomp-run
+// RUN: %libomp-compile -DMY_SCHEDULE=guided && %libomp-run
+
+// Only works with Intel Compiler since at least version 15.0
+// XFAIL: gcc, clang
+
+/*
+ * Test that large bounds are handled properly and calculations of
+ * loop iterations don't accidentally overflow
+ */
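+/* With the constants below, each loop spans roughly 4.1e9 values (from
+   INT_MIN/INT_MAX to +/-2000000000) in steps of 50000000, i.e. about 83
+   iterations; the span itself does not fit in a signed 32-bit int, which is
+   what the runtime's trip-count computation has to handle. */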
+#include <stdio.h>
+#include <omp.h>
+#include <stdlib.h>
+#include <limits.h>
+#include "omp_testsuite.h"
+
+#define INCR 50000000
+#define MY_MAX 2000000000
+#define MY_MIN -2000000000
+#ifndef MY_SCHEDULE
+# define MY_SCHEDULE static
+#endif
+
+int a, b, a_known_value, b_known_value;
+
+int test_omp_for_bigbounds()
+{
+ a = 0;
+ b = 0;
+ #pragma omp parallel
+ {
+ int i;
+ #pragma omp for schedule(MY_SCHEDULE)
+ for (i = INT_MIN; i < MY_MAX; i+=INCR) {
+ #pragma omp atomic
+ a++;
+ }
+ #pragma omp for schedule(MY_SCHEDULE)
+ for (i = INT_MAX; i >= MY_MIN; i-=INCR) {
+ #pragma omp atomic
+ b++;
+ }
+ }
+ printf("a = %d (should be %d), b = %d (should be %d)\n", a, a_known_value, b, b_known_value);
+ return (a == a_known_value && b == b_known_value);
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ a_known_value = 0;
+ for (i = INT_MIN; i < MY_MAX; i+=INCR) {
+ a_known_value++;
+ }
+
+ b_known_value = 0;
+ for (i = INT_MAX; i >= MY_MIN; i-=INCR) {
+ b_known_value++;
+ }
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_for_bigbounds()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
diff --git a/final/runtime/test/worksharing/for/omp_for_collapse.c b/final/runtime/test/worksharing/for/omp_for_collapse.c
new file mode 100644
index 0000000..a08086d
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_collapse.c
@@ -0,0 +1,51 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+/* Utility function to check that i is increasing monotonically
+ with each call */
+static int check_i_islarger (int i)
+{
+ static int last_i;
+ int islarger;
+ if (i==1)
+ last_i=0;
+ islarger = ((i >= last_i)&&(i - last_i<=1));
+ last_i = i;
+ return (islarger);
+}
+
+int test_omp_for_collapse()
+{
+ int is_larger = 1;
+
+ #pragma omp parallel
+ {
+ int i,j;
+ int my_islarger = 1;
+ #pragma omp for private(i,j) schedule(static,1) collapse(2) ordered
+ for (i = 1; i < 100; i++) {
+ for (j =1; j <100; j++) {
+ #pragma omp ordered
+ my_islarger = check_i_islarger(i)&&my_islarger;
+ }
+ }
+ #pragma omp critical
+ is_larger = is_larger && my_islarger;
+ }
+ return (is_larger);
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_for_collapse()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
diff --git a/final/runtime/test/worksharing/for/omp_for_firstprivate.c b/final/runtime/test/worksharing/for/omp_for_firstprivate.c
new file mode 100644
index 0000000..6c4121c
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_firstprivate.c
@@ -0,0 +1,55 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+int sum1;
+#pragma omp threadprivate(sum1)
+
+int test_omp_for_firstprivate()
+{
+ int sum;
+ int sum0;
+ int known_sum;
+ int threadsnum;
+
+ sum = 0;
+ sum0 = 12345;
+ sum1 = 0;
+
+ #pragma omp parallel
+ {
+ #pragma omp single
+ {
+ threadsnum=omp_get_num_threads();
+ }
+ /* sum0 = 0; */
+
+ int i;
+ #pragma omp for firstprivate(sum0)
+ for (i = 1; i <= LOOPCOUNT; i++) {
+ sum0 = sum0 + i;
+ sum1 = sum0;
+ } /* end of for */
+
+ #pragma omp critical
+ {
+ sum = sum + sum1;
+ } /* end of critical */
+ } /* end of parallel */
+ known_sum = 12345 * threadsnum + (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+ return (known_sum == sum);
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_for_firstprivate()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
diff --git a/final/runtime/test/worksharing/for/omp_for_lastprivate.c b/final/runtime/test/worksharing/for/omp_for_lastprivate.c
new file mode 100644
index 0000000..88694b8
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_lastprivate.c
@@ -0,0 +1,52 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+int sum0;
+#pragma omp threadprivate(sum0)
+
+int test_omp_for_lastprivate()
+{
+ int sum = 0;
+ int known_sum;
+ int i0;
+
+ i0 = -1;
+
+ #pragma omp parallel
+ {
+ sum0 = 0;
+ { /* Begin of orphaned block */
+ int i;
+ #pragma omp for schedule(static,7) lastprivate(i0)
+ for (i = 1; i <= LOOPCOUNT; i++) {
+ sum0 = sum0 + i;
+ i0 = i;
+ } /* end of for */
+ } /* end of orphaned block */
+
+ #pragma omp critical
+ {
+ sum = sum + sum0;
+ } /* end of critical */
+ } /* end of parallel */
+
+ known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+ fprintf(stderr, "known_sum = %d , sum = %d\n",known_sum,sum);
+ fprintf(stderr, "LOOPCOUNT = %d , i0 = %d\n",LOOPCOUNT,i0);
+ return ((known_sum == sum) && (i0 == LOOPCOUNT));
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for (i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_for_lastprivate()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
diff --git a/final/runtime/test/worksharing/for/omp_for_nowait.c b/final/runtime/test/worksharing/for/omp_for_nowait.c
new file mode 100644
index 0000000..95a4775
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_nowait.c
@@ -0,0 +1,77 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+/*
+ * This test will hang if the nowait is not working properly.
+ *
+ * It relies on a thread skipping to the second for construct to
+ * release the threads in the first for construct.
+ *
+ * Also, we use static scheduling to guarantee that one
+ * thread will make it to the second for construct.
+ */
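+/*
+ * Each of the two loops below increments count once per iteration (4 + 4),
+ * so a successful run ends with count == 8.
+ */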
+volatile int release;
+volatile int count;
+
+void wait_for_release_then_increment(int rank)
+{
+ fprintf(stderr, "Thread nr %d enters first for construct"
+ " and waits.\n", rank);
+ while (release == 0);
+ #pragma omp atomic
+ count++;
+}
+
+void release_and_increment(int rank)
+{
+ fprintf(stderr, "Thread nr %d sets release to 1\n", rank);
+ release = 1;
+ #pragma omp atomic
+ count++;
+}
+
+int test_omp_for_nowait()
+{
+ release = 0;
+ count = 0;
+
+ #pragma omp parallel num_threads(4)
+ {
+ int rank;
+ int i;
+
+ rank = omp_get_thread_num();
+
+ #pragma omp for schedule(static) nowait
+ for (i = 0; i < 4; i++) {
+ if (i < 3)
+ wait_for_release_then_increment(rank);
+ else {
+ fprintf(stderr, "Thread nr %d enters first for and goes "
+ "immediately to the next for construct to release.\n", rank);
+ #pragma omp atomic
+ count++;
+ }
+ }
+
+ #pragma omp for schedule(static)
+ for (i = 0; i < 4; i++) {
+ release_and_increment(rank);
+ }
+ }
+ return (count==8);
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_for_nowait()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
diff --git a/final/runtime/test/worksharing/for/omp_for_ordered.c b/final/runtime/test/worksharing/for/omp_for_ordered.c
new file mode 100644
index 0000000..18ac7eb
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_ordered.c
@@ -0,0 +1,60 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+static int last_i = 0;
+
+/* Utility function to check that i is increasing monotonically
+ with each call */
+static int check_i_islarger (int i)
+{
+ int islarger;
+ islarger = (i > last_i);
+ last_i = i;
+ return (islarger);
+}
+
+int test_omp_for_ordered()
+{
+ int sum;
+ int is_larger = 1;
+ int known_sum;
+
+ last_i = 0;
+ sum = 0;
+
+ #pragma omp parallel
+ {
+ int i;
+ int my_islarger = 1;
+ #pragma omp for schedule(static,1) ordered
+ for (i = 1; i < 100; i++) {
+ #pragma omp ordered
+ {
+ my_islarger = check_i_islarger(i) && my_islarger;
+ sum = sum + i;
+ }
+ }
+ #pragma omp critical
+ {
+ is_larger = is_larger && my_islarger;
+ }
+ }
+
+ known_sum=(99 * 100) / 2;
+ return ((known_sum == sum) && is_larger);
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_for_ordered()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
diff --git a/final/runtime/test/worksharing/for/omp_for_private.c b/final/runtime/test/worksharing/for/omp_for_private.c
new file mode 100644
index 0000000..1f537b9
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_private.c
@@ -0,0 +1,63 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+/* Utility function do spend some time in a loop */
+static void do_some_work()
+{
+ int i;
+ double sum = 0;
+ for(i = 0; i < 1000; i++){
+ sum += sqrt ((double) i);
+ }
+}
+
+int sum1;
+#pragma omp threadprivate(sum1)
+
+int test_omp_for_private()
+{
+ int sum = 0;
+ int sum0;
+ int known_sum;
+
+ sum0 = 0; /* setting (global) sum0 = 0 */
+
+ #pragma omp parallel
+ {
+ sum1 = 0; /* setting sum1 in each thread to 0 */
+ { /* begin of orphaned block */
+ int i;
+ #pragma omp for private(sum0) schedule(static,1)
+ for (i = 1; i <= LOOPCOUNT; i++) {
+ sum0 = sum1;
+ #pragma omp flush
+ sum0 = sum0 + i;
+ do_some_work ();
+ #pragma omp flush
+ sum1 = sum0;
+ }
+ } /* end of orphaned block */
+
+ #pragma omp critical
+ {
+ sum = sum + sum1;
+ } /*end of critical*/
+ } /* end of parallel*/
+ known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+ return (known_sum == sum);
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_for_private()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
diff --git a/final/runtime/test/worksharing/for/omp_for_reduction.c b/final/runtime/test/worksharing/for/omp_for_reduction.c
new file mode 100644
index 0000000..28f0907
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_reduction.c
@@ -0,0 +1,339 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+#define DOUBLE_DIGITS 20 /* dt^DOUBLE_DIGITS */
+#define MAX_FACTOR 10
+#define KNOWN_PRODUCT 3628800 /* 10! */
+
+int test_omp_for_reduction ()
+{
+ double dt;
+ int sum;
+ int diff;
+ int product = 1;
+ double dsum;
+ double dknown_sum;
+ double ddiff;
+ int logic_and;
+ int logic_or;
+ int bit_and;
+ int bit_or;
+ int exclusiv_bit_or;
+ int *logics;
+ int i;
+ int known_sum;
+ int known_product;
+ double rounding_error = 1.E-9; /* overall rounding error to be
+ ignored in the double tests */
+ double dpt;
+ int result = 0;
+ int logicsArray[LOOPCOUNT];
+
+ /* Variables for integer tests */
+ sum = 0;
+ product = 1;
+ known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+ /* Variables for double tests */
+ dt = 1. / 3.; /* base of the geometric series for the + and - tests */
+ dsum = 0.;
+ /* Variables for logic tests */
+ logics = logicsArray;
+ logic_and = 1;
+ logic_or = 0;
+ /* Variables for bit operator tests */
+ bit_and = 1;
+ bit_or = 0;
+ /* Variables for exclusive bit or */
+ exclusiv_bit_or = 0;
+
+ /************************************************************************/
+ /** Tests for integers **/
+ /************************************************************************/
+
+ /**** Testing integer addition ****/
+ #pragma omp parallel
+ {
+ int j;
+ #pragma omp for schedule(dynamic,1) reduction(+:sum)
+ for (j = 1; j <= LOOPCOUNT; j++) {
+ sum = sum + j;
+ }
+ }
+ if (known_sum != sum) {
+ result++;
+ fprintf (stderr, "Error in sum with integers: Result was %d"
+ " instead of %d.\n", sum, known_sum);
+ }
+
+ /**** Testing integer subtraction ****/
+ diff = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+ #pragma omp parallel
+ {
+ int j;
+ #pragma omp for schedule(dynamic,1) reduction(-:diff)
+ for (j = 1; j <= LOOPCOUNT; j++) {
+ diff = diff - j;
+ }
+ }
+ if (diff != 0) {
+ result++;
+ fprintf (stderr, "Error in difference with integers: Result was %d"
+ " instead of 0.\n", diff);
+ }
+
+ /**** Testing integer multiplication ****/
+ #pragma omp parallel
+ {
+ int j;
+ #pragma omp for schedule(dynamic,1) reduction(*:product)
+ for (j = 1; j <= MAX_FACTOR; j++) {
+ product *= j;
+ }
+ }
+ known_product = KNOWN_PRODUCT;
+ if(known_product != product) {
+ result++;
+ fprintf (stderr,"Error in Product with integers: Result was %d"
+ " instead of %d\n",product,known_product);
+ }
+
+ /************************************************************************/
+ /** Tests for doubles **/
+ /************************************************************************/
+
+ /**** Testing double addition ****/
+ dsum = 0.;
+ dpt = 1.;
+ for (i = 0; i < DOUBLE_DIGITS; ++i) {
+ dpt *= dt;
+ }
+ dknown_sum = (1 - dpt) / (1 - dt);
+ #pragma omp parallel
+ {
+ int j;
+ #pragma omp for schedule(dynamic,1) reduction(+:dsum)
+ for (j = 0; j < DOUBLE_DIGITS; j++) {
+ dsum += pow (dt, j);
+ }
+ }
+ if (fabs (dsum - dknown_sum) > rounding_error) {
+ result++;
+ fprintf (stderr, "\nError in sum with doubles: Result was %f"
+ " instead of: %f (Difference: %E)\n",
+ dsum, dknown_sum, dsum-dknown_sum);
+ }
+
+ /**** Testing double subtraction ****/
+ ddiff = (1 - dpt) / (1 - dt);
+ #pragma omp parallel
+ {
+ int j;
+ #pragma omp for schedule(dynamic,1) reduction(-:ddiff)
+ for (j = 0; j < DOUBLE_DIGITS; ++j) {
+ ddiff -= pow (dt, j);
+ }
+ }
+ if (fabs (ddiff) > rounding_error) {
+ result++;
+ fprintf (stderr, "Error in Difference with doubles: Result was %E"
+ " instead of 0.0\n", ddiff);
+ }
+
+
+ /************************************************************************/
+ /** Tests for logical values **/
+ /************************************************************************/
+
+ /**** Testing logic and ****/
+ for (i = 0; i < LOOPCOUNT; i++) {
+ logics[i] = 1;
+ }
+
+ #pragma omp parallel
+ {
+ int j;
+ #pragma omp for schedule(dynamic,1) reduction(&&:logic_and)
+ for (j = 0; j < LOOPCOUNT; ++j) {
+ logic_and = (logic_and && logics[j]);
+ }
+ }
+ if(!logic_and) {
+ result++;
+ fprintf (stderr, "Error in logic AND part 1\n");
+ }
+
+ logic_and = 1;
+ logics[LOOPCOUNT / 2] = 0;
+
+ #pragma omp parallel
+ {
+ int j;
+ #pragma omp for schedule(dynamic,1) reduction(&&:logic_and)
+ for (j = 0; j < LOOPCOUNT; ++j) {
+ logic_and = logic_and && logics[j];
+ }
+ }
+ if(logic_and) {
+ result++;
+ fprintf (stderr, "Error in logic AND part 2\n");
+ }
+
+ /**** Testing logic or ****/
+ for (i = 0; i < LOOPCOUNT; i++) {
+ logics[i] = 0;
+ }
+
+ #pragma omp parallel
+ {
+ int j;
+ #pragma omp for schedule(dynamic,1) reduction(||:logic_or)
+ for (j = 0; j < LOOPCOUNT; ++j) {
+ logic_or = logic_or || logics[j];
+ }
+ }
+ if (logic_or) {
+ result++;
+ fprintf (stderr, "Error in logic OR part 1\n");
+ }
+
+ logic_or = 0;
+ logics[LOOPCOUNT / 2] = 1;
+
+ #pragma omp parallel
+ {
+ int j;
+ #pragma omp for schedule(dynamic,1) reduction(||:logic_or)
+ for (j = 0; j < LOOPCOUNT; ++j) {
+ logic_or = logic_or || logics[j];
+ }
+ }
+ if(!logic_or) {
+ result++;
+ fprintf (stderr, "Error in logic OR part 2\n");
+ }
+
+ /************************************************************************/
+ /** Tests for bit values **/
+ /************************************************************************/
+
+ /**** Testing bit and ****/
+ for (i = 0; i < LOOPCOUNT; ++i) {
+ logics[i] = 1;
+ }
+
+ #pragma omp parallel
+ {
+ int j;
+ #pragma omp for schedule(dynamic,1) reduction(&:bit_and)
+ for (j = 0; j < LOOPCOUNT; ++j) {
+ bit_and = (bit_and & logics[j]);
+ }
+ }
+ if (!bit_and) {
+ result++;
+ fprintf (stderr, "Error in BIT AND part 1\n");
+ }
+
+ bit_and = 1;
+ logics[LOOPCOUNT / 2] = 0;
+
+ #pragma omp parallel
+ {
+ int j;
+ #pragma omp for schedule(dynamic,1) reduction(&:bit_and)
+ for (j = 0; j < LOOPCOUNT; ++j) {
+ bit_and = bit_and & logics[j];
+ }
+ }
+ if (bit_and) {
+ result++;
+ fprintf (stderr, "Error in BIT AND part 2\n");
+ }
+
+ /**** Testing bit or ****/
+ for (i = 0; i < LOOPCOUNT; i++) {
+ logics[i] = 0;
+ }
+
+ #pragma omp parallel
+ {
+ int j;
+ #pragma omp for schedule(dynamic,1) reduction(|:bit_or)
+ for (j = 0; j < LOOPCOUNT; ++j) {
+ bit_or = bit_or | logics[j];
+ }
+ }
+ if (bit_or) {
+ result++;
+ fprintf (stderr, "Error in BIT OR part 1\n");
+ }
+
+ bit_or = 0;
+ logics[LOOPCOUNT / 2] = 1;
+
+ #pragma omp parallel
+ {
+ int j;
+ #pragma omp for schedule(dynamic,1) reduction(|:bit_or)
+ for (j = 0; j < LOOPCOUNT; ++j) {
+ bit_or = bit_or | logics[j];
+ }
+ }
+ if (!bit_or) {
+ result++;
+ fprintf (stderr, "Error in BIT OR part 2\n");
+ }
+
+ /**** Testing exclusive bit or ****/
+ for (i = 0; i < LOOPCOUNT; i++) {
+ logics[i] = 0;
+ }
+
+ #pragma omp parallel
+ {
+ int j;
+ #pragma omp for schedule(dynamic,1) reduction(^:exclusiv_bit_or)
+ for (j = 0; j < LOOPCOUNT; ++j) {
+ exclusiv_bit_or = exclusiv_bit_or ^ logics[j];
+ }
+ }
+ if (exclusiv_bit_or) {
+ result++;
+ fprintf (stderr, "Error in EXCLUSIV BIT OR part 1\n");
+ }
+
+ exclusiv_bit_or = 0;
+ logics[LOOPCOUNT / 2] = 1;
+
+ #pragma omp parallel
+ {
+ int j;
+ #pragma omp for schedule(dynamic,1) reduction(^:exclusiv_bit_or)
+ for (j = 0; j < LOOPCOUNT; ++j) {
+ exclusiv_bit_or = exclusiv_bit_or ^ logics[j];
+ }
+ }
+ if (!exclusiv_bit_or) {
+ result++;
+ fprintf (stderr, "Error in EXCLUSIV BIT OR part 2\n");
+ }
+
+ free (logics);
+ return (result == 0);
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_for_reduction()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
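The floating-point reduction checks in omp_for_reduction.c above compare a term-by-term sum of powers against the closed form of a finite geometric series, sum_{j=0}^{n-1} t^j = (1 - t^n) / (1 - t). A minimal standalone sketch of that identity (not part of the patch; it only assumes a C compiler and libm):

#include <math.h>
#include <stdio.h>

int main(void)
{
    const int n = 20;            /* matches DOUBLE_DIGITS in the test */
    const double t = 1.0 / 3.0;
    double lhs = 0.0, tn = 1.0;
    int j;

    for (j = 0; j < n; j++) {
        lhs += pow(t, j);        /* term-by-term sum, as the reduction does */
        tn *= t;                 /* accumulates t^n */
    }
    /* closed form used for dknown_sum: (1 - t^n) / (1 - t) */
    printf("sum = %.15f  closed form = %.15f\n", lhs, (1.0 - tn) / (1.0 - t));
    return 0;
}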
diff --git a/final/runtime/test/worksharing/for/omp_for_schedule_auto.c b/final/runtime/test/worksharing/for/omp_for_schedule_auto.c
new file mode 100644
index 0000000..075617c
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_schedule_auto.c
@@ -0,0 +1,69 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+int sum1;
+#pragma omp threadprivate(sum1)
+
+int test_omp_for_auto()
+{
+ int j;
+ int sum;
+ int sum0;
+ int known_sum;
+ int threadsnum;
+
+ sum = 0;
+ sum0 = 12345;
+
+ // array which keeps track of which threads participated in the for loop
+ // e.g., given 4 threads, [ 0 | 1 | 1 | 0 ] implies
+ // threads 0 and 3 did not, threads 1 and 2 did
+ int max_threads = omp_get_max_threads();
+ int* active_threads = (int*)malloc(sizeof(int)*max_threads);
+ for(j = 0; j < max_threads; j++)
+ active_threads[j] = 0;
+
+ #pragma omp parallel
+ {
+ int i;
+ sum1 = 0;
+ #pragma omp for firstprivate(sum0) schedule(auto)
+ for (i = 1; i <= LOOPCOUNT; i++) {
+ active_threads[omp_get_thread_num()] = 1;
+ sum0 = sum0 + i;
+ sum1 = sum0;
+ }
+
+ #pragma omp critical
+ {
+ sum = sum + sum1;
+ }
+ }
+
+ // count the threads that participated (sum is stored in threadsnum)
+ threadsnum=0;
+ for(j = 0; j < max_threads; j++) {
+ if(active_threads[j])
+ threadsnum++;
+ }
+ free(active_threads);
+
+ known_sum = 12345 * threadsnum + (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+ return (known_sum == sum);
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_for_auto()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
diff --git a/final/runtime/test/worksharing/for/omp_for_schedule_dynamic.c b/final/runtime/test/worksharing/for/omp_for_schedule_dynamic.c
new file mode 100644
index 0000000..6d4f59b
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_schedule_dynamic.c
@@ -0,0 +1,89 @@
+// RUN: %libomp-compile-and-run
+/*
+ * Test for dynamic scheduling with chunk size
+ * Method: calculate how many times the iteration space is dispatched
+ * and check that each dispatch has the requested chunk size
+ * unless it is the last one.
+ * It is possible for two adjacent chunks to be assigned to the same thread.
+ * Modified by Chunhua Liao
+ */
+#include <stdio.h>
+#include <omp.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+
+#define CFDMAX_SIZE 100
+const int chunk_size = 7;
+
+int test_omp_for_schedule_dynamic()
+{
+ int tid;
+ int *tids;
+ int i;
+ int tidsArray[CFDMAX_SIZE];
+ int count = 0;
+ int tmp_count = 0; /* number of dispatches */
+ int *tmp; /* stores the chunk size of each dispatch */
+ int result = 0;
+
+ tids = tidsArray;
+
+ #pragma omp parallel private(tid) shared(tids)
+ { /* begin of parallel */
+ int tid;
+ tid = omp_get_thread_num ();
+ #pragma omp for schedule(dynamic,chunk_size)
+ for (i = 0; i < CFDMAX_SIZE; i++) {
+ tids[i] = tid;
+ }
+ }
+
+ for (i = 0; i < CFDMAX_SIZE - 1; ++i) {
+ if (tids[i] != tids[i + 1]) {
+ count++;
+ }
+ }
+
+ tmp = (int *) malloc (sizeof (int) * (count + 1));
+ tmp[0] = 1;
+
+ for (i = 0; i < CFDMAX_SIZE - 1; ++i) {
+ if (tmp_count > count) {
+ printf ("--------------------\nTestinternal Error: List too small!!!\n--------------------\n"); /* Error handling */
+ break;
+ }
+ if (tids[i] != tids[i + 1]) {
+ tmp_count++;
+ tmp[tmp_count] = 1;
+ } else {
+ tmp[tmp_count]++;
+ }
+ }
+ /* is dynamic statement working? */
+ for (i = 0; i < count; i++) {
+ if ((tmp[i]%chunk_size)!=0) {
+ /* it is possible for two adjacent chunks to be assigned to the same thread */
+ result++;
+ fprintf(stderr,"The intermediate dispatch has wrong chunksize.\n");
+ /* result += ((tmp[i] / chunk_size) - 1); */
+ }
+ }
+ if ((tmp[count]%chunk_size)!=(CFDMAX_SIZE%chunk_size)) {
+ result++;
+ fprintf(stderr,"the last dispatch has wrong chunksize.\n");
+ }
+ /* for (int i=0;i<count+1;++i) printf("%d\t:=\t%d\n",i+1,tmp[i]); */
+ return (result==0);
+}
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_for_schedule_dynamic()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
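For reference, the bookkeeping the dynamic-schedule test above reconstructs from the tids[] array can also be derived directly: with N iterations and chunk size c, dynamic scheduling hands out ceil(N/c) chunks, each of c iterations except possibly the last. The modulus checks in the test are needed because two adjacent chunks given to the same thread are indistinguishable in tids[]. A small standalone sketch of the expected numbers (not part of the patch):

#include <stdio.h>

int main(void)
{
    const int N = 100, c = 7;             /* CFDMAX_SIZE and chunk_size above */
    int dispatches = (N + c - 1) / c;     /* ceil(N/c) chunks are handed out */
    int last = (N % c == 0) ? c : N % c;  /* size of the final, short chunk */

    printf("%d dispatches, last chunk has %d iterations\n", dispatches, last);
    return 0;
}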
diff --git a/final/runtime/test/worksharing/for/omp_for_schedule_guided.c b/final/runtime/test/worksharing/for/omp_for_schedule_guided.c
new file mode 100644
index 0000000..1ee7449
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_schedule_guided.c
@@ -0,0 +1,217 @@
+// RUN: %libomp-compile-and-run
+
+/* Test for guided scheduling
+ * Ensure threads first get chunks in an interleaved fashion,
+ * then check that the chunk sizes decrease toward a stable value.
+ * Modified by Chunhua Liao
+ * For example, 100 iterations on 2 threads, chunk size 7,
+ * one line per dispatch, 0/1 is the thread id:
+ * 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 24
+ * 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 18
+ * 0 0 0 0 0 0 0 0 0 0 0 0 0 0 14
+ * 1 1 1 1 1 1 1 1 1 1 10
+ * 0 0 0 0 0 0 0 0 8
+ * 1 1 1 1 1 1 1 7
+ * 0 0 0 0 0 0 0 7
+ * 1 1 1 1 1 1 1 7
+ * 0 0 0 0 0 5
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+#define CFSMAX_SIZE 1000
+#define MAX_TIME 0.005
+
+#ifdef SLEEPTIME
+#undef SLEEPTIME
+#define SLEEPTIME 0.0001
+#endif
+
+int test_omp_for_schedule_guided()
+{
+ int * tids;
+ int * chunksizes;
+ int notout;
+ int maxiter;
+ int threads;
+ int i;
+ int result;
+
+ tids = (int *) malloc (sizeof (int) * (CFSMAX_SIZE + 1));
+ maxiter = 0;
+ result = 1;
+ notout = 1;
+
+ /* Testing if enough threads are available for this check. */
+ #pragma omp parallel
+ {
+ #pragma omp single
+ {
+ threads = omp_get_num_threads();
+ }
+ }
+
+ /* ensure there are at least two threads */
+ if (threads < 2) {
+ omp_set_num_threads(2);
+ threads = 2;
+ }
+
+ /* Now the real parallel work:
+ * Each thread will start immediately with the first chunk.
+ */
+ #pragma omp parallel shared(tids,maxiter)
+ { /* begin of parallel */
+ double count;
+ int tid;
+ int j;
+
+ tid = omp_get_thread_num ();
+
+ #pragma omp for nowait schedule(guided)
+ for(j = 0; j < CFSMAX_SIZE; ++j) {
+ count = 0.;
+ #pragma omp flush(maxiter)
+ if (j > maxiter) {
+ #pragma omp critical
+ {
+ maxiter = j;
+ }
+ }
+ /*printf ("thread %d sleeping\n", tid);*/
+ #pragma omp flush(maxiter,notout)
+ while (notout && (count < MAX_TIME) && (maxiter == j)) {
+ #pragma omp flush(maxiter,notout)
+ my_sleep (SLEEPTIME);
+ count += SLEEPTIME;
+#ifdef VERBOSE
+ printf(".");
+#endif
+ }
+#ifdef VERBOSE
+ if (count > 0.) printf(" waited %lf s\n", count);
+#endif
+ /*printf ("thread %d awake\n", tid);*/
+ tids[j] = tid;
+#ifdef VERBOSE
+ printf("%d finished by %d\n",j,tid);
+#endif
+ } /* end of for */
+ notout = 0;
+ #pragma omp flush(maxiter,notout)
+ } /* end of parallel */
+
+ /*******************************************************
+ * evaluation of the values *
+ *******************************************************/
+ {
+ int determined_chunksize = 1;
+ int last_threadnr = tids[0];
+ int global_chunknr = 0;
+ int openwork = CFSMAX_SIZE;
+ int expected_chunk_size;
+ int* local_chunknr = (int*)malloc(threads * sizeof(int));
+ double c = 1;
+
+ for (i = 0; i < threads; i++)
+ local_chunknr[i] = 0;
+
+ tids[CFSMAX_SIZE] = -1;
+
+ /*
+ * determine the number of global chunks
+ */
+ // fprintf(stderr,"# global_chunknr thread local_chunknr chunksize\n");
+ for(i = 1; i <= CFSMAX_SIZE; ++i) {
+ if (last_threadnr==tids[i]) {
+ determined_chunksize++;
+ } else {
+ /* fprintf(stderr, "%d\t%d\t%d\t%d\n", global_chunknr,
+ last_threadnr, local_chunknr[last_threadnr], m); */
+ global_chunknr++;
+ local_chunknr[last_threadnr]++;
+ last_threadnr = tids[i];
+ determined_chunksize = 1;
+ }
+ }
+ /* now allocate the memory for saving the sizes of the global chunks */
+ chunksizes = (int*)malloc(global_chunknr * sizeof(int));
+
+ /*
+ * Evaluate the sizes of the global chunks
+ */
+ global_chunknr = 0;
+ determined_chunksize = 1;
+ last_threadnr = tids[0];
+ for (i = 1; i <= CFSMAX_SIZE; ++i) {
+ /* If the thread number is the same as before, increase the
+ * detected chunksize for this chunk; otherwise record the finished
+ * chunk, reset the detected chunksize to one and save the number
+ * of the next thread in last_threadnr.
+ */
+ if (last_threadnr == tids[i]) {
+ determined_chunksize++;
+ } else {
+ chunksizes[global_chunknr] = determined_chunksize;
+ global_chunknr++;
+ local_chunknr[last_threadnr]++;
+ last_threadnr = tids[i];
+ determined_chunksize = 1;
+ }
+ }
+
+#ifdef VERBOSE
+ fprintf(stderr, "found\texpected\tconstant\n");
+#endif
+
+ /* identify the constant c for the exponential
+ decrease of the chunksize */
+ expected_chunk_size = openwork / threads;
+ c = (double) chunksizes[0] / expected_chunk_size;
+
+ for (i = 0; i < global_chunknr; i++) {
+ /* calculate the new expected chunksize */
+ if (expected_chunk_size > 1)
+ expected_chunk_size = c * openwork / threads;
+#ifdef VERBOSE
+ fprintf(stderr, "%8d\t%8d\t%lf\n", chunksizes[i],
+ expected_chunk_size, c * chunksizes[i]/expected_chunk_size);
+#endif
+ /* check if chunksize is inside the rounding errors */
+ if (abs (chunksizes[i] - expected_chunk_size) >= 2) {
+ result = 0;
+#ifndef VERBOSE
+ fprintf(stderr, "Chunksize differed from expected "
+ "value: %d instead of %d\n", chunksizes[i],
+ expected_chunk_size);
+ return 0;
+#endif
+ } /* end if */
+
+#ifndef VERBOSE
+ if (expected_chunk_size - chunksizes[i] < 0)
+ fprintf(stderr, "Chunksize did not decrease: %d"
+ " instead of %d\n", chunksizes[i],expected_chunk_size);
+#endif
+
+ /* calculating the remaining amount of work */
+ openwork -= chunksizes[i];
+ }
+ }
+ return result;
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_for_schedule_guided()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
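The evaluation loop in the guided-schedule test above assumes each dispatch hands out roughly remaining/threads iterations, so chunk sizes decay geometrically toward 1. A standalone sketch of that idealized model (not part of the patch; the real runtime may round differently and enforces a minimum chunk size):

#include <stdio.h>

int main(void)
{
    int remaining = 1000;   /* CFSMAX_SIZE in the test */
    int nthreads = 2;       /* assumed team size */

    while (remaining > 0) {
        int chunk = remaining / nthreads;  /* idealized guided dispatch */
        if (chunk < 1)
            chunk = 1;                     /* minimum chunk size */
        printf("%d ", chunk);
        remaining -= chunk;
    }
    printf("\n");
    return 0;
}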
diff --git a/final/runtime/test/worksharing/for/omp_for_schedule_runtime.c b/final/runtime/test/worksharing/for/omp_for_schedule_runtime.c
new file mode 100644
index 0000000..b957fc3
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_schedule_runtime.c
@@ -0,0 +1,82 @@
+// RUN: %libomp-compile
+// RUN: env OMP_SCHEDULE=static %libomp-run 1 0
+// RUN: env OMP_SCHEDULE=static,10 %libomp-run 1 10
+// RUN: env OMP_SCHEDULE=dynamic %libomp-run 2 1
+// RUN: env OMP_SCHEDULE=dynamic,11 %libomp-run 2 11
+// RUN: env OMP_SCHEDULE=guided %libomp-run 3 1
+// RUN: env OMP_SCHEDULE=guided,12 %libomp-run 3 12
+// RUN: env OMP_SCHEDULE=auto %libomp-run 4 1
+// RUN: env OMP_SCHEDULE=trapezoidal %libomp-run 101 1
+// RUN: env OMP_SCHEDULE=trapezoidal,13 %libomp-run 101 13
+// RUN: env OMP_SCHEDULE=static_steal %libomp-run 102 1
+// RUN: env OMP_SCHEDULE=static_steal,14 %libomp-run 102 14
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+int sum;
+char* correct_kind_string;
+omp_sched_t correct_kind;
+int correct_chunk_size;
+
+int test_omp_for_runtime()
+{
+ int sum;
+ int known_sum;
+ int chunk_size;
+ int error;
+ omp_sched_t kind;
+
+ sum = 0;
+ error = 0;
+ known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+ omp_get_schedule(&kind, &chunk_size);
+
+ printf("omp_get_schedule() returns: Schedule = %d, Chunk Size = %d\n",
+ kind, chunk_size);
+ if (kind != correct_kind) {
+ printf("kind(%d) != correct_kind(%d)\n", kind, correct_kind);
+ error = 1;
+ }
+ if (chunk_size != correct_chunk_size) {
+ printf("chunk_size(%d) != correct_chunk_size(%d)\n", chunk_size,
+ correct_chunk_size);
+ error = 1;
+ }
+
+ #pragma omp parallel
+ {
+ int i;
+ #pragma omp for schedule(runtime)
+ for (i = 1; i <= LOOPCOUNT; i++) {
+ #pragma omp critical
+ sum+=i;
+ }
+ }
+ if (known_sum != sum) {
+ printf("Known Sum = %d, Calculated Sum = %d\n", known_sum, sum);
+ error = 1;
+ }
+ return !error;
+}
+
+int main(int argc, char** argv)
+{
+ int i;
+ int num_failed=0;
+ if (argc != 3) {
+ fprintf(stderr, "usage: %s schedule_kind chunk_size\n", argv[0]);
+ fprintf(stderr, " Run with envirable OMP_SCHEDULE=kind[,chunk_size]\n");
+ return 1;
+ }
+ correct_kind = atoi(argv[1]);
+ correct_chunk_size = atoi(argv[2]);
+
+ for (i = 0; i < REPETITIONS; i++) {
+ if (!test_omp_for_runtime()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
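The RUN lines above drive schedule(runtime) through the OMP_SCHEDULE environment variable; the same schedule can also be queried and set programmatically. A minimal standalone sketch (not part of the patch):

#include <stdio.h>
#include <omp.h>

int main(void)
{
    omp_sched_t kind;
    int chunk, i, sum = 0;

    /* equivalent to running with OMP_SCHEDULE=dynamic,11 in the environment */
    omp_set_schedule(omp_sched_dynamic, 11);
    omp_get_schedule(&kind, &chunk);
    printf("kind = %d, chunk = %d\n", (int)kind, chunk);

    #pragma omp parallel for schedule(runtime) reduction(+:sum)
    for (i = 0; i < 100; i++)
        sum += i;               /* dispatched dynamically, 11 iterations at a time */
    printf("sum = %d\n", sum);  /* 4950 */
    return 0;
}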
diff --git a/final/runtime/test/worksharing/for/omp_for_schedule_static.c b/final/runtime/test/worksharing/for/omp_for_schedule_static.c
new file mode 100644
index 0000000..f46a544
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_schedule_static.c
@@ -0,0 +1,154 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+#define CFSMAX_SIZE 1000
+#define MAX_TIME 0.01
+
+#ifdef SLEEPTIME
+#undef SLEEPTIME
+#define SLEEPTIME 0.0005
+#endif
+
+int test_omp_for_schedule_static()
+{
+ int threads;
+ int i,lasttid;
+ int * tids;
+ int notout;
+ int maxiter;
+ int chunk_size;
+ int counter = 0;
+ int tmp_count=1;
+ int lastthreadsstarttid = -1;
+ int result = 1;
+
+ chunk_size = 7;
+ tids = (int *) malloc (sizeof (int) * (CFSMAX_SIZE + 1));
+ notout = 1;
+ maxiter = 0;
+
+ #pragma omp parallel shared(tids,counter)
+ { /* begin of parallel*/
+ #pragma omp single
+ {
+ threads = omp_get_num_threads ();
+ } /* end of single */
+ } /* end of parallel */
+
+ if (threads < 2) {
+ omp_set_num_threads(2);
+ threads = 2;
+ }
+ fprintf (stderr,"Using an internal count of %d\nUsing a specified"
+ " chunksize of %d\n", CFSMAX_SIZE, chunk_size);
+ tids[CFSMAX_SIZE] = -1; /* setting endflag */
+ #pragma omp parallel shared(tids)
+ { /* begin of parallel */
+ double count;
+ int tid;
+ int j;
+
+ tid = omp_get_thread_num ();
+
+ #pragma omp for nowait schedule(static,chunk_size)
+ for(j = 0; j < CFSMAX_SIZE; ++j) {
+ count = 0.;
+ #pragma omp flush(maxiter)
+ if (j > maxiter) {
+ #pragma omp critical
+ {
+ maxiter = j;
+ }
+ }
+ /*printf ("thread %d sleeping\n", tid);*/
+ while (notout && (count < MAX_TIME) && (maxiter == j)) {
+ #pragma omp flush(maxiter,notout)
+ my_sleep (SLEEPTIME);
+ count += SLEEPTIME;
+ printf(".");
+ }
+#ifdef VERBOSE
+ if (count > 0.) printf(" waited %lf s\n", count);
+#endif
+ /*printf ("thread %d awake\n", tid);*/
+ tids[j] = tid;
+#ifdef VERBOSE
+ printf("%d finished by %d\n",j,tid);
+#endif
+ } /* end of for */
+ notout = 0;
+ #pragma omp flush(maxiter,notout)
+ } /* end of parallel */
+
+ /**** analysing the data in array tids ****/
+
+ lasttid = tids[0];
+ tmp_count = 0;
+
+ for (i = 0; i < CFSMAX_SIZE + 1; ++i) {
+ /* If the work was done by the same thread increase tmp_count by one. */
+ if (tids[i] == lasttid) {
+ tmp_count++;
+#ifdef VERBOSE
+ fprintf (stderr, "%d: %d \n", i, tids[i]);
+#endif
+ continue;
+ }
+
+ /* Check whether the next thread has the right thread number. Finding
+ * thread number -1 means the end has been reached.
+ */
+ if (tids[i] == (lasttid + 1) % threads || tids[i] == -1) {
+ /* checking for the right chunk size */
+ if (tmp_count == chunk_size) {
+ tmp_count = 1;
+ lasttid = tids[i];
+#ifdef VERBOSE
+ fprintf (stderr, "OK\n");
+#endif
+ } else {
+ /* If the chunk size was wrong, check if the end was reached */
+ if (tids[i] == -1) {
+ if (i == CFSMAX_SIZE) {
+ fprintf (stderr, "Last thread had chunk size %d\n",
+ tmp_count);
+ break;
+ } else {
+ fprintf (stderr, "ERROR: Last thread (thread with"
+ " number -1) was found before the end.\n");
+ result = 0;
+ }
+ } else {
+ fprintf (stderr, "ERROR: chunk size was %d. (assigned"
+ " was %d)\n", tmp_count, chunk_size);
+ result = 0;
+ }
+ }
+ } else {
+ fprintf(stderr, "ERROR: Found thread with number %d (should be"
+ " inbetween 0 and %d).", tids[i], threads - 1);
+ result = 0;
+ }
+#ifdef VERBOSE
+ fprintf (stderr, "%d: %d \n", i, tids[i]);
+#endif
+ }
+
+ return result;
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_for_schedule_static()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
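The analysis loop in the static-schedule test above checks the round-robin chunk assignment implied by schedule(static,chunk): iteration i is expected to run on thread (i / chunk) % nthreads. A standalone sketch of that expectation (not part of the patch; the team size is captured inside the region rather than assumed):

#include <stdio.h>
#include <omp.h>

int main(void)
{
    enum { N = 40, CHUNK = 7 };
    int owner[N], i, nthreads = 1, ok = 1;

    #pragma omp parallel
    {
        int j;
        #pragma omp single
        nthreads = omp_get_num_threads();
        #pragma omp for schedule(static, CHUNK)
        for (j = 0; j < N; j++)
            owner[j] = omp_get_thread_num();     /* record who ran iteration j */
    }

    for (i = 0; i < N; i++)
        if (owner[i] != (i / CHUNK) % nthreads)  /* expected round-robin owner */
            ok = 0;
    printf("round-robin mapping %s\n", ok ? "matches" : "differs");
    return 0;
}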
diff --git a/final/runtime/test/worksharing/for/omp_for_schedule_static_3.c b/final/runtime/test/worksharing/for/omp_for_schedule_static_3.c
new file mode 100644
index 0000000..922f27a
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_schedule_static_3.c
@@ -0,0 +1,202 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+#define CFSMAX_SIZE 1000
+#define MAX_TIME 0.01
+
+#ifdef SLEEPTIME
+#undef SLEEPTIME
+#define SLEEPTIME 0.0005
+#endif
+
+#define VERBOSE 0
+
+int test_omp_for_schedule_static_3()
+{
+ int threads;
+ int i,lasttid;
+
+ int * tids;
+ int * tids2;
+ int notout;
+ int maxiter;
+ int chunk_size;
+
+ int counter = 0;
+ int tmp_count=1;
+ int lastthreadsstarttid = -1;
+ int result = 1;
+ chunk_size = 7;
+
+ tids = (int *) malloc (sizeof (int) * (CFSMAX_SIZE + 1));
+ notout = 1;
+ maxiter = 0;
+
+ #pragma omp parallel shared(tids,counter)
+ { /* begin of parallel*/
+ #pragma omp single
+ {
+ threads = omp_get_num_threads ();
+ } /* end of single */
+ } /* end of parallel */
+
+ /* Ensure that at least two threads are created */
+ if (threads < 2) {
+ omp_set_num_threads(2);
+ threads = 2;
+ }
+ fprintf (stderr,"Using an internal count of %d\nUsing a"
+ " specified chunksize of %d\n", CFSMAX_SIZE, chunk_size);
+ tids[CFSMAX_SIZE] = -1; /* setting endflag */
+
+ #pragma omp parallel shared(tids)
+ { /* begin of parallel */
+ double count;
+ int tid;
+ int j;
+
+ tid = omp_get_thread_num ();
+
+ #pragma omp for nowait schedule(static,chunk_size)
+ for(j = 0; j < CFSMAX_SIZE; ++j) {
+ count = 0.;
+ #pragma omp flush(maxiter)
+ if (j > maxiter) {
+ #pragma omp critical
+ {
+ maxiter = j;
+ }
+ }
+ /*printf ("thread %d sleeping\n", tid);*/
+ while (notout && (count < MAX_TIME) && (maxiter == j)) {
+ #pragma omp flush(maxiter,notout)
+ my_sleep (SLEEPTIME);
+ count += SLEEPTIME;
+ printf(".");
+ }
+#ifdef VERBOSE
+ if (count > 0.) printf(" waited %lf s\n", count);
+#endif
+ /*printf ("thread %d awake\n", tid);*/
+ tids[j] = tid;
+#ifdef VERBOSE
+ printf("%d finished by %d\n",j,tid);
+#endif
+ } /* end of omp parallel for */
+
+ notout = 0;
+ #pragma omp flush(maxiter,notout)
+ } /* end of parallel */
+
+ /**** analysing the data in array tids ****/
+
+ lasttid = tids[0];
+ tmp_count = 0;
+
+ for (i = 0; i < CFSMAX_SIZE + 1; ++i) {
+ /* If the work was done by the same thread
+ increase tmp_count by one. */
+ if (tids[i] == lasttid) {
+ tmp_count++;
+#ifdef VERBOSE
+ fprintf (stderr, "%d: %d \n", i, tids[i]);
+#endif
+ continue;
+ }
+
+ /* Check whether the next thread has the right thread number.
+ * Finding thread number -1 means the end has been reached.
+ */
+ if (tids[i] == (lasttid + 1) % threads || tids[i] == -1) {
+ /* checking for the right chunk size */
+ if (tmp_count == chunk_size) {
+ tmp_count = 1;
+ lasttid = tids[i];
+#ifdef VERBOSE
+ fprintf (stderr, "OK\n");
+#endif
+ } else {
+ /* If the chunk size was wrong, check if the end was reached */
+ if (tids[i] == -1) {
+ if (i == CFSMAX_SIZE) {
+ fprintf (stderr, "Last thread had chunk size %d\n",
+ tmp_count);
+ break;
+ } else {
+ fprintf (stderr, "ERROR: Last thread (thread with"
+ " number -1) was found before the end.\n");
+ result = 0;
+ }
+ } else {
+ fprintf (stderr, "ERROR: chunk size was %d. (assigned"
+ " was %d)\n", tmp_count, chunk_size);
+ result = 0;
+ }
+ }
+ } else {
+ fprintf(stderr, "ERROR: Found thread with number %d (should be"
+ " inbetween 0 and %d).", tids[i], threads - 1);
+ result = 0;
+ }
+#ifdef VERBOSE
+ fprintf (stderr, "%d: %d \n", i, tids[i]);
+#endif
+ }
+
+ /* Now we check whether several loop regions in one parallel region have
+ * the same logical assignment of chunks to threads. We use the nowait
+ * clause to increase the probability of provoking an error. */
+
+ /* First we allocate some more memory */
+ free (tids);
+ tids = (int *) malloc (sizeof (int) * LOOPCOUNT);
+ tids2 = (int *) malloc (sizeof (int) * LOOPCOUNT);
+
+ #pragma omp parallel
+ {
+ {
+ int n;
+ #pragma omp for schedule(static) nowait
+ for (n = 0; n < LOOPCOUNT; n++) {
+ if (LOOPCOUNT == n + 1 )
+ my_sleep(SLEEPTIME);
+
+ tids[n] = omp_get_thread_num();
+ }
+ }
+ {
+ int m;
+ #pragma omp for schedule(static) nowait
+ for (m = 1; m <= LOOPCOUNT; m++) {
+ tids2[m-1] = omp_get_thread_num();
+ }
+ }
+ }
+
+ for (i = 0; i < LOOPCOUNT; i++)
+ if (tids[i] != tids2[i]) {
+ fprintf (stderr, "Chunk no. %d was assigned once to thread %d and"
+ " later to thread %d.\n", i, tids[i],tids2[i]);
+ result = 0;
+ }
+
+ free (tids);
+ free (tids2);
+ return result;
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for (i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_for_schedule_static_3()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
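The last part of omp_for_schedule_static_3.c above relies on the OpenMP guarantee that two worksharing loops with schedule(static), the same iteration count and the same chunk size, inside the same parallel region, map iterations to threads identically; that is what makes nowait safe between them. A standalone sketch of the pattern (not part of the patch):

#include <stdio.h>
#include <omp.h>

int main(void)
{
    enum { N = 64 };
    int a[N], b[N], i, ok = 1;

    #pragma omp parallel
    {
        int j;
        #pragma omp for schedule(static) nowait
        for (j = 0; j < N; j++)
            a[j] = j * j;           /* produced by whichever thread owns j */

        #pragma omp for schedule(static)
        for (j = 0; j < N; j++)
            b[j] = a[j];            /* the same thread owns j again, so a[j] is ready */
    }

    for (i = 0; i < N; i++)
        if (b[i] != i * i)
            ok = 0;
    printf("static loop pairing %s\n", ok ? "held" : "was violated");
    return 0;
}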
diff --git a/final/runtime/test/worksharing/for/omp_parallel_for_firstprivate.c b/final/runtime/test/worksharing/for/omp_parallel_for_firstprivate.c
new file mode 100644
index 0000000..3b3bf7d
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_parallel_for_firstprivate.c
@@ -0,0 +1,35 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int test_omp_parallel_for_firstprivate()
+{
+ int sum ;
+ int i2;
+ int i;
+ int known_sum;
+
+ sum=0;
+ i2=3;
+
+ #pragma omp parallel for reduction(+:sum) private(i) firstprivate(i2)
+ for (i = 1; i <= LOOPCOUNT; i++) {
+ sum = sum + (i + i2);
+ }
+
+ known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2 + i2 * LOOPCOUNT;
+ return (known_sum == sum);
+} /* end of check_parallel_for_firstprivate */
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_parallel_for_firstprivate()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
diff --git a/final/runtime/test/worksharing/for/omp_parallel_for_if.c b/final/runtime/test/worksharing/for/omp_parallel_for_if.c
new file mode 100644
index 0000000..57fe498
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_parallel_for_if.c
@@ -0,0 +1,42 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+int test_omp_parallel_for_if()
+{
+ int known_sum;
+ int num_threads;
+ int sum, sum2;
+ int i;
+ int control;
+
+ control = 0;
+ num_threads=0;
+ sum = 0;
+ sum2 = 0;
+
+ #pragma omp parallel for private(i) if (control==1)
+ for (i=0; i <= LOOPCOUNT; i++) {
+ num_threads = omp_get_num_threads();
+ sum = sum + i;
+ }
+
+ known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+ fprintf(stderr, "Number of threads determined by"
+ " omp_get_num_threads: %d\n", num_threads);
+ return (known_sum == sum && num_threads == 1);
+} /* end of check_parallel_for_if */
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_parallel_for_if()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
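The if clause exercised above (with control == 1 always false, so the loop runs on a single thread) is more typically used to keep small workloads serial. A standalone sketch with a hypothetical trip-count threshold (not part of the patch):

#include <stdio.h>
#include <omp.h>

static double sum_squares(int n)
{
    double s = 0.0;
    int i;
    /* fork a team only when the trip count makes it worthwhile;
       the 10000 threshold is a made-up example value */
    #pragma omp parallel for reduction(+:s) if(n > 10000)
    for (i = 0; i < n; i++)
        s += (double)i * i;
    return s;
}

int main(void)
{
    printf("small: %.0f\n", sum_squares(100));     /* runs on one thread */
    printf("large: %.0f\n", sum_squares(100000));  /* runs on the full team */
    return 0;
}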
diff --git a/final/runtime/test/worksharing/for/omp_parallel_for_lastprivate.c b/final/runtime/test/worksharing/for/omp_parallel_for_lastprivate.c
new file mode 100644
index 0000000..a53cfb2
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_parallel_for_lastprivate.c
@@ -0,0 +1,37 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int test_omp_parallel_for_lastprivate()
+{
+ int sum;
+ int i;
+ int i0;
+ int known_sum;
+
+ sum =0;
+ i0 = -1;
+
+ #pragma omp parallel for reduction(+:sum) \
+ schedule(static,7) private(i) lastprivate(i0)
+ for (i = 1; i <= LOOPCOUNT; i++) {
+ sum = sum + i;
+ i0 = i;
+ } /* end of parallel for */
+
+ known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+ return ((known_sum == sum) && (i0 == LOOPCOUNT));
+} /* end of check_parallel_for_lastprivate */
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_parallel_for_lastprivate()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
diff --git a/final/runtime/test/worksharing/for/omp_parallel_for_ordered.c b/final/runtime/test/worksharing/for/omp_parallel_for_ordered.c
new file mode 100644
index 0000000..5fef460
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_parallel_for_ordered.c
@@ -0,0 +1,64 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+static int last_i = 0;
+
+int i;
+#pragma omp threadprivate(i)
+
+/* Variable ii is used to avoid problems with a threadprivate variable used as a loop
+ * index. See test omp_threadprivate_for.
+ */
+static int ii;
+#pragma omp threadprivate(ii)
+
+/*!
+ Utility function: returns true if the passed argument is larger than
+ the argument of the last call of this function.
+ */
+static int check_i_islarger2(int i)
+{
+ int islarger;
+ islarger = (i > last_i);
+ last_i = i;
+ return (islarger);
+}
+
+int test_omp_parallel_for_ordered()
+{
+ int sum;
+ int is_larger;
+ int known_sum;
+ int i;
+
+ sum = 0;
+ is_larger = 1;
+ last_i = 0;
+ #pragma omp parallel for schedule(static,1) private(i) ordered
+ for (i = 1; i < 100; i++) {
+ ii = i;
+ #pragma omp ordered
+ {
+ is_larger = check_i_islarger2 (ii) && is_larger;
+ sum = sum + ii;
+ }
+ }
+ known_sum = (99 * 100) / 2;
+ fprintf (stderr," known_sum = %d , sum = %d \n", known_sum, sum);
+ fprintf (stderr," is_larger = %d\n", is_larger);
+ return (known_sum == sum) && is_larger;
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_parallel_for_ordered()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
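The ordered test above depends on the ordered construct executing its block in iteration order, which is what lets check_i_islarger2() observe strictly increasing arguments. A minimal standalone sketch of that behaviour (not part of the patch):

#include <stdio.h>
#include <omp.h>

int main(void)
{
    int i;
    /* the loop directive needs the ordered clause for the
       ordered region inside it to be allowed */
    #pragma omp parallel for ordered schedule(static,1)
    for (i = 0; i < 8; i++) {
        #pragma omp ordered
        printf("%d ", i);   /* always prints 0 1 2 3 4 5 6 7 in order */
    }
    printf("\n");
    return 0;
}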
diff --git a/final/runtime/test/worksharing/for/omp_parallel_for_private.c b/final/runtime/test/worksharing/for/omp_parallel_for_private.c
new file mode 100644
index 0000000..1231d36
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_parallel_for_private.c
@@ -0,0 +1,50 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+/*! Utility function to spend some time in a loop */
+static void do_some_work (void)
+{
+ int i;
+ double sum = 0;
+ for(i = 0; i < 1000; i++){
+ sum += sqrt (i);
+ }
+}
+
+int test_omp_parallel_for_private()
+{
+ int sum;
+ int i;
+ int i2;
+ int known_sum;
+
+ sum =0;
+ i2=0;
+
+ #pragma omp parallel for reduction(+:sum) schedule(static,1) private(i) private(i2)
+ for (i=1;i<=LOOPCOUNT;i++)
+ {
+ i2 = i;
+ #pragma omp flush
+ do_some_work ();
+ #pragma omp flush
+ sum = sum + i2;
+ } /*end of for*/
+ known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+ return (known_sum == sum);
+} /* end of check_parallel_for_private */
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_parallel_for_private()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
diff --git a/final/runtime/test/worksharing/for/omp_parallel_for_reduction.c b/final/runtime/test/worksharing/for/omp_parallel_for_reduction.c
new file mode 100644
index 0000000..118d730
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_parallel_for_reduction.c
@@ -0,0 +1,266 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+#define DOUBLE_DIGITS 20 /* dt^DOUBLE_DIGITS */
+#define MAX_FACTOR 10
+#define KNOWN_PRODUCT 3628800 /* 10! */
+
+int test_omp_parallel_for_reduction()
+{
+ int sum;
+ int known_sum;
+ double dsum;
+ double dknown_sum;
+ double dt=0.5; /* base of geometric row for + and - test*/
+ double rounding_error= 1.E-9;
+ int diff;
+ double ddiff;
+ int product;
+ int known_product;
+ int logic_and;
+ int logic_or;
+ int bit_and;
+ int bit_or;
+ int exclusiv_bit_or;
+ int logics[LOOPCOUNT];
+ int i;
+ double dpt;
+ int result;
+
+ sum =0;
+ dsum=0;
+ dt = 1./3.;
+ result = 0;
+ product = 1;
+ logic_and=1;
+ logic_or=0;
+ bit_and=1;
+ bit_or=0;
+ exclusiv_bit_or=0;
+
+ /* Tests for integers */
+ known_sum = (LOOPCOUNT*(LOOPCOUNT+1))/2;
+ #pragma omp parallel for schedule(dynamic,1) private(i) reduction(+:sum)
+ for (i=1;i<=LOOPCOUNT;i++) {
+ sum=sum+i;
+ }
+ if(known_sum!=sum) {
+ result++;
+ fprintf(stderr,"Error in sum with integers: Result was %d"
+ " instead of %d\n",sum,known_sum);
+ }
+
+ diff = (LOOPCOUNT*(LOOPCOUNT+1))/2;
+ #pragma omp parallel for schedule(dynamic,1) private(i) reduction(-:diff)
+ for (i=1;i<=LOOPCOUNT;++i) {
+ diff=diff-i;
+ }
+ if(diff != 0) {
+ result++;
+ fprintf(stderr,"Error in difference with integers: Result was %d"
+ " instead of 0.\n",diff);
+ }
+
+ /* Tests for doubles */
+ dsum=0;
+ dpt=1;
+ for (i=0;i<DOUBLE_DIGITS;++i) {
+ dpt*=dt;
+ }
+ dknown_sum = (1-dpt)/(1-dt);
+ #pragma omp parallel for schedule(dynamic,1) private(i) reduction(+:dsum)
+ for (i=0;i<DOUBLE_DIGITS;++i) {
+ dsum += pow(dt,i);
+ }
+ if( fabs(dsum-dknown_sum) > rounding_error ) {
+ result++;
+ fprintf(stderr,"Error in sum with doubles: Result was %f"
+ " instead of %f (Difference: %E)\n",
+ dsum, dknown_sum, dsum-dknown_sum);
+ }
+
+ dpt=1;
+
+ for (i=0;i<DOUBLE_DIGITS;++i) {
+ dpt*=dt;
+ }
+ fprintf(stderr,"\n");
+ ddiff = (1-dpt)/(1-dt);
+ #pragma omp parallel for schedule(dynamic,1) private(i) reduction(-:ddiff)
+ for (i=0;i<DOUBLE_DIGITS;++i) {
+ ddiff -= pow(dt,i);
+ }
+ if( fabs(ddiff) > rounding_error) {
+ result++;
+ fprintf(stderr,"Error in Difference with doubles: Result was %E"
+ " instead of 0.0\n",ddiff);
+ }
+
+ /* Tests for integers */
+ #pragma omp parallel for schedule(dynamic,1) private(i) reduction(*:product)
+ for(i=1;i<=MAX_FACTOR;i++) {
+ product *= i;
+ }
+ known_product = KNOWN_PRODUCT;
+ if(known_product != product) {
+ result++;
+ fprintf(stderr,"Error in Product with integers: Result was %d"
+ " instead of %d\n\n",product,known_product);
+ }
+
+ /* Tests for logic AND */
+ for(i=0;i<LOOPCOUNT;i++) {
+ logics[i]=1;
+ }
+
+ #pragma omp parallel for schedule(dynamic,1) private(i) \
+ reduction(&&:logic_and)
+ for(i=0;i<LOOPCOUNT;++i) {
+ logic_and = (logic_and && logics[i]);
+ }
+ if(!logic_and) {
+ result++;
+ fprintf(stderr,"Error in logic AND part 1.\n");
+ }
+
+ logic_and = 1;
+ logics[LOOPCOUNT/2]=0;
+
+ #pragma omp parallel for schedule(dynamic,1) private(i) \
+ reduction(&&:logic_and)
+ for(i=0;i<LOOPCOUNT;++i) {
+ logic_and = logic_and && logics[i];
+ }
+ if(logic_and) {
+ result++;
+ fprintf(stderr,"Error in logic AND part 2.\n");
+ }
+
+ /* Tests for logic OR */
+ for(i=0;i<LOOPCOUNT;i++) {
+ logics[i]=0;
+ }
+
+ #pragma omp parallel for schedule(dynamic,1) private(i) \
+ reduction(||:logic_or)
+ for(i=0;i<LOOPCOUNT;++i) {
+ logic_or = logic_or || logics[i];
+ }
+ if(logic_or) {
+ result++;
+ fprintf(stderr,"Error in logic OR part 1.\n");
+ }
+ logic_or = 0;
+ logics[LOOPCOUNT/2]=1;
+
+ #pragma omp parallel for schedule(dynamic,1) private(i) \
+ reduction(||:logic_or)
+ for(i=0;i<LOOPCOUNT;++i) {
+ logic_or = logic_or || logics[i];
+ }
+ if(!logic_or) {
+ result++;
+ fprintf(stderr,"Error in logic OR part 2.\n");
+ }
+
+ /* Tests for bitwise AND */
+ for(i=0;i<LOOPCOUNT;++i) {
+ logics[i]=1;
+ }
+
+ #pragma omp parallel for schedule(dynamic,1) private(i) \
+ reduction(&:bit_and)
+ for(i=0;i<LOOPCOUNT;++i) {
+ bit_and = (bit_and & logics[i]);
+ }
+ if(!bit_and) {
+ result++;
+ fprintf(stderr,"Error in BIT AND part 1.\n");
+ }
+
+ bit_and = 1;
+ logics[LOOPCOUNT/2]=0;
+
+ #pragma omp parallel for schedule(dynamic,1) private(i) \
+ reduction(&:bit_and)
+ for(i=0;i<LOOPCOUNT;++i) {
+ bit_and = bit_and & logics[i];
+ }
+ if(bit_and) {
+ result++;
+ fprintf(stderr,"Error in BIT AND part 2.\n");
+ }
+
+ /* Tests for bitwise OR */
+ for(i=0;i<LOOPCOUNT;i++) {
+ logics[i]=0;
+ }
+
+ #pragma omp parallel for schedule(dynamic,1) private(i) \
+ reduction(|:bit_or)
+ for(i=0;i<LOOPCOUNT;++i) {
+ bit_or = bit_or | logics[i];
+ }
+ if(bit_or) {
+ result++;
+ fprintf(stderr,"Error in BIT OR part 1\n");
+ }
+ bit_or = 0;
+ logics[LOOPCOUNT/2]=1;
+
+ #pragma omp parallel for schedule(dynamic,1) private(i) \
+ reduction(|:bit_or)
+ for(i=0;i<LOOPCOUNT;++i) {
+ bit_or = bit_or | logics[i];
+ }
+ if(!bit_or) {
+ result++;
+ fprintf(stderr,"Error in BIT OR part 2\n");
+ }
+
+ /* Tests for bitwise XOR */
+ for(i=0;i<LOOPCOUNT;i++) {
+ logics[i]=0;
+ }
+
+ #pragma omp parallel for schedule(dynamic,1) private(i) \
+ reduction(^:exclusiv_bit_or)
+ for(i=0;i<LOOPCOUNT;++i) {
+ exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+ }
+ if(exclusiv_bit_or) {
+ result++;
+ fprintf(stderr,"Error in EXCLUSIV BIT OR part 1\n");
+ }
+
+ exclusiv_bit_or = 0;
+ logics[LOOPCOUNT/2]=1;
+
+ #pragma omp parallel for schedule(dynamic,1) private(i) \
+ reduction(^:exclusiv_bit_or)
+ for(i=0;i<LOOPCOUNT;++i) {
+ exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+ }
+ if(!exclusiv_bit_or) {
+ result++;
+ fprintf(stderr,"Error in EXCLUSIV BIT OR part 2\n");
+ }
+
+ /*printf("\nResult:%d\n",result);*/
+ return (result==0);
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for(i = 0; i < REPETITIONS; i++) {
+ if(!test_omp_parallel_for_reduction()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}