// RUN: %libomp-compile-and-run
// REQUIRES: openmp-4.5
/*
  Test for the 'schedule(simd:guided)' clause.
  Compiler needs to generate a dynamic dispatching and pass the schedule
  value 46 to the OpenMP RTL. Test uses numerous loop parameter combinations.
*/
#include <stdio.h>
#include <omp.h>

// delay() is a short sleep used while polling for another thread.
// The macros deliberately carry no trailing ';' so that `delay();` expands
// to exactly one statement.
#if defined(WIN32) || defined(_WIN32)
#include <windows.h>
#define delay() Sleep(1)
#else
#include <unistd.h>
#define delay() usleep(10)
#endif

// uncomment for debug diagnostics:
//#define DEBUG

#define SIMD_LEN 4

// ---------------------------------------------------------------------------
// Various definitions copied from OpenMP RTL

// Schedule kinds; the numeric values must match the ones used by the RTL.
enum sched {
  kmp_sch_static_balanced_chunked = 45,
  kmp_sch_guided_simd = 46,
  kmp_sch_runtime_simd = 47,
};

typedef unsigned u32;
typedef long long i64;
typedef unsigned long long u64;

// Descriptor passed as the first argument of every __kmpc_* entry point.
typedef struct {
  int reserved_1;
  int flags;
  int reserved_2;
  int reserved_3;
  char *psource;
} id;

extern int __kmpc_global_thread_num(id*);
extern void __kmpc_barrier(id*, int gtid);
extern void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int);
extern void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64);
extern int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*);
extern int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*);
// End of definitions copied from OpenMP RTL.
// --------------------------------------------------------------------------- static id loc = {0, 2, 0, 0, ";file;func;0;0;;"}; // --------------------------------------------------------------------------- int run_loop_64(i64 loop_lb, i64 loop_ub, i64 loop_st, int loop_chunk) { int err = 0; static int volatile loop_sync = 0; i64 lb; // Chunk lower bound i64 ub; // Chunk upper bound i64 st; // Chunk stride int rc; int tid = omp_get_thread_num(); int gtid = tid; int last; #if DEBUG printf("run_loop_<%d>(lb=%d, ub=%d, st=%d, ch=%d)\n", (int)sizeof(i64), gtid, tid, (int)loop_lb, (int)loop_ub, (int)loop_st, loop_chunk); #endif // Don't test degenerate cases that should have been discovered by codegen if (loop_st == 0) return 0; if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub) return 0; __kmpc_dispatch_init_8(&loc, gtid, kmp_sch_guided_simd, loop_lb, loop_ub, loop_st, loop_chunk); if (tid == 0) { // Let the master thread handle the chunks alone int chunk; // No of current chunk i64 next_lb; // Lower bound of the next chunk i64 last_ub; // Upper bound of the last processed chunk u64 cur; // Number of interations in current chunk u64 max; // Max allowed iterations for current chunk int undersized = 0; chunk = 0; next_lb = loop_lb; max = (loop_ub - loop_lb) / loop_st + 1; // The first chunk can consume all iterations while (__kmpc_dispatch_next_8(&loc, gtid, &last, &lb, &ub, &st)) { ++ chunk; #if DEBUG printf("chunk=%d, lb=%d, ub=%d\n", chunk, (int)lb, (int)ub); #endif // Check if previous chunk (it is not the final chunk) is undersized if (undersized) { printf("Error with chunk %d\n", chunk); err++; } // Check lower and upper bounds if (lb != next_lb) { printf("Error with lb %d, %d, ch %d\n", (int)lb, (int)next_lb, chunk); err++; } if (loop_st > 0) { if (!(ub <= loop_ub)) { printf("Error with ub %d, %d, ch %d\n", (int)ub, (int)loop_ub, chunk); err++; } if (!(lb <= ub)) { printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk); err++; } } else { if (!(ub 
>= loop_ub)) { printf("Error with ub %d, %d, %d\n", (int)ub, (int)loop_ub, chunk); err++; } if (!(lb >= ub)) { printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk); err++; } }; // if // Stride should not change if (!(st == loop_st)) { printf("Error with st %d, %d, ch %d\n", (int)st, (int)loop_st, chunk); err++; } cur = (ub - lb) / loop_st + 1; // Guided scheduling uses FP computations, so current chunk may // be a bit bigger (+1) than allowed maximum if (!(cur <= max + 1)) { printf("Error with iter %d, %d\n", cur, max); err++; } // Update maximum for the next chunk if (cur < max) max = cur; next_lb = ub + loop_st; last_ub = ub; undersized = (cur < loop_chunk); }; // while // Must have at least one chunk if (!(chunk > 0)) { printf("Error with chunk %d\n", chunk); err++; } // Must have the right last iteration index if (loop_st > 0) { if (!(last_ub <= loop_ub)) { printf("Error with last1 %d, %d, ch %d\n", (int)last_ub, (int)loop_ub, chunk); err++; } if (!(last_ub + loop_st > loop_ub)) { printf("Error with last2 %d, %d, %d, ch %d\n", (int)last_ub, (int)loop_st, (int)loop_ub, chunk); err++; } } else { if (!(last_ub >= loop_ub)) { printf("Error with last1 %d, %d, ch %d\n", (int)last_ub, (int)loop_ub, chunk); err++; } if (!(last_ub + loop_st < loop_ub)) { printf("Error with last2 %d, %d, %d, ch %d\n", (int)last_ub, (int)loop_st, (int)loop_ub, chunk); err++; } }; // if // Let non-master threads go loop_sync = 1; } else { int i; // Workers wait for master thread to finish, then call __kmpc_dispatch_next for (i = 0; i < 1000000; ++ i) { if (loop_sync != 0) { break; }; // if }; // for i while (loop_sync == 0) { delay(); }; // while // At this moment we do not have any more chunks -- all the chunks already // processed by master thread rc = __kmpc_dispatch_next_8(&loc, gtid, &last, &lb, &ub, &st); if (rc) { printf("Error return value\n"); err++; } }; // if __kmpc_barrier(&loc, gtid); if (tid == 0) { loop_sync = 0; // Restore original state #if DEBUG 
printf("run_loop_64(): at the end\n"); #endif }; // if __kmpc_barrier(&loc, gtid); return err; } // run_loop // --------------------------------------------------------------------------- int run_loop_32(int loop_lb, int loop_ub, int loop_st, int loop_chunk) { int err = 0; static int volatile loop_sync = 0; int lb; // Chunk lower bound int ub; // Chunk upper bound int st; // Chunk stride int rc; int tid = omp_get_thread_num(); int gtid = tid; int last; #if DEBUG printf("run_loop_<%d>(lb=%d, ub=%d, st=%d, ch=%d)\n", (int)sizeof(int), gtid, tid, (int)loop_lb, (int)loop_ub, (int)loop_st, loop_chunk); #endif // Don't test degenerate cases that should have been discovered by codegen if (loop_st == 0) return 0; if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub) return 0; __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_guided_simd, loop_lb, loop_ub, loop_st, loop_chunk); if (tid == 0) { // Let the master thread handle the chunks alone int chunk; // No of current chunk int next_lb; // Lower bound of the next chunk int last_ub; // Upper bound of the last processed chunk u64 cur; // Number of interations in current chunk u64 max; // Max allowed iterations for current chunk int undersized = 0; chunk = 0; next_lb = loop_lb; max = (loop_ub - loop_lb) / loop_st + 1; // The first chunk can consume all iterations while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) { ++ chunk; #if DEBUG printf("chunk=%d, lb=%d, ub=%d\n", chunk, (int)lb, (int)ub); #endif // Check if previous chunk (it is not the final chunk) is undersized if (undersized) { printf("Error with chunk %d\n", chunk); err++; } // Check lower and upper bounds if (lb != next_lb) { printf("Error with lb %d, %d, ch %d\n", (int)lb, (int)next_lb, chunk); err++; } if (loop_st > 0) { if (!(ub <= loop_ub)) { printf("Error with ub %d, %d, ch %d\n", (int)ub, (int)loop_ub, chunk); err++; } if (!(lb <= ub)) { printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk); err++; } } else { if (!(ub >= loop_ub)) { 
printf("Error with ub %d, %d, %d\n", (int)ub, (int)loop_ub, chunk); err++; } if (!(lb >= ub)) { printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk); err++; } }; // if // Stride should not change if (!(st == loop_st)) { printf("Error with st %d, %d, ch %d\n", (int)st, (int)loop_st, chunk); err++; } cur = (ub - lb) / loop_st + 1; // Guided scheduling uses FP computations, so current chunk may // be a bit bigger (+1) than allowed maximum if (!(cur <= max + 1)) { printf("Error with iter %d, %d\n", cur, max); err++; } // Update maximum for the next chunk if (cur < max) max = cur; next_lb = ub + loop_st; last_ub = ub; undersized = (cur < loop_chunk); }; // while // Must have at least one chunk if (!(chunk > 0)) { printf("Error with chunk %d\n", chunk); err++; } // Must have the right last iteration index if (loop_st > 0) { if (!(last_ub <= loop_ub)) { printf("Error with last1 %d, %d, ch %d\n", (int)last_ub, (int)loop_ub, chunk); err++; } if (!(last_ub + loop_st > loop_ub)) { printf("Error with last2 %d, %d, %d, ch %d\n", (int)last_ub, (int)loop_st, (int)loop_ub, chunk); err++; } } else { if (!(last_ub >= loop_ub)) { printf("Error with last1 %d, %d, ch %d\n", (int)last_ub, (int)loop_ub, chunk); err++; } if (!(last_ub + loop_st < loop_ub)) { printf("Error with last2 %d, %d, %d, ch %d\n", (int)last_ub, (int)loop_st, (int)loop_ub, chunk); err++; } }; // if // Let non-master threads go loop_sync = 1; } else { int i; // Workers wait for master thread to finish, then call __kmpc_dispatch_next for (i = 0; i < 1000000; ++ i) { if (loop_sync != 0) { break; }; // if }; // for i while (loop_sync == 0) { delay(); }; // while // At this moment we do not have any more chunks -- all the chunks already // processed by the master thread rc = __kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st); if (rc) { printf("Error return value\n"); err++; } }; // if __kmpc_barrier(&loc, gtid); if (tid == 0) { loop_sync = 0; // Restore original state #if DEBUG printf("run_loop<>(): at 
the end\n"); #endif }; // if __kmpc_barrier(&loc, gtid); return err; } // run_loop // --------------------------------------------------------------------------- int run_64(int num_th) { int err = 0; #pragma omp parallel num_threads(num_th) { int chunk; i64 st, lb, ub; for (chunk = SIMD_LEN; chunk <= 3*SIMD_LEN; chunk += SIMD_LEN) { for (st = 1; st <= 3; ++ st) { for (lb = -3 * num_th * st; lb <= 3 * num_th * st; ++ lb) { for (ub = lb; ub < lb + num_th * (chunk+1) * st; ++ ub) { err += run_loop_64(lb, ub, st, chunk); err += run_loop_64(ub, lb, -st, chunk); }; // for ub }; // for lb }; // for st }; // for chunk } return err; } // run_all int run_32(int num_th) { int err = 0; #pragma omp parallel num_threads(num_th) { int chunk, st, lb, ub; for (chunk = SIMD_LEN; chunk <= 3*SIMD_LEN; chunk += SIMD_LEN) { for (st = 1; st <= 3; ++ st) { for (lb = -3 * num_th * st; lb <= 3 * num_th * st; ++ lb) { for (ub = lb; ub < lb + num_th * (chunk+1) * st; ++ ub) { err += run_loop_32(lb, ub, st, chunk); err += run_loop_32(ub, lb, -st, chunk); }; // for ub }; // for lb }; // for st }; // for chunk } return err; } // run_all // --------------------------------------------------------------------------- int main() { int n, err = 0; for (n = 1; n <= 4; ++ n) { err += run_32(n); err += run_64(n); }; // for n if (err) printf("failed with %d errors\n", err); else printf("passed\n"); return err; }