diff options
Diffstat (limited to 'final/runtime/src/kmp_itt.inl')
-rw-r--r-- | final/runtime/src/kmp_itt.inl | 1043 |
1 files changed, 1043 insertions, 0 deletions
diff --git a/final/runtime/src/kmp_itt.inl b/final/runtime/src/kmp_itt.inl new file mode 100644 index 0000000..a0789fc --- /dev/null +++ b/final/runtime/src/kmp_itt.inl @@ -0,0 +1,1043 @@ +#if USE_ITT_BUILD +/* + * kmp_itt.inl -- Inline functions of ITT Notify. + */ + +//===----------------------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// + +// Inline function definitions. This file should be included into kmp_itt.h file +// for production build (to let compiler inline functions) or into kmp_itt.c +// file for debug build (to reduce the number of files to recompile and save +// build time). + +#include "kmp.h" +#include "kmp_str.h" + +#if KMP_ITT_DEBUG +extern kmp_bootstrap_lock_t __kmp_itt_debug_lock; +#define KMP_ITT_DEBUG_LOCK() \ + { __kmp_acquire_bootstrap_lock(&__kmp_itt_debug_lock); } +#define KMP_ITT_DEBUG_PRINT(...) \ + { \ + fprintf(stderr, "#%02d: ", __kmp_get_gtid()); \ + fprintf(stderr, __VA_ARGS__); \ + fflush(stderr); \ + __kmp_release_bootstrap_lock(&__kmp_itt_debug_lock); \ + } +#else +#define KMP_ITT_DEBUG_LOCK() +#define KMP_ITT_DEBUG_PRINT(...) +#endif // KMP_ITT_DEBUG + +// Ensure that the functions are static if they're supposed to be inlined. +// Otherwise they cannot be used in more than one file, since there will be +// multiple definitions. +#if KMP_DEBUG +#define LINKAGE +#else +#define LINKAGE static inline +#endif + +// ZCA interface used by Intel(R) Inspector. Intel(R) Parallel Amplifier uses +// this API to support user-defined synchronization primitives, but does not use +// ZCA; it would be safe to turn this off until wider support becomes available. 
+#if USE_ITT_ZCA +#ifdef __INTEL_COMPILER +#if __INTEL_COMPILER >= 1200 +#undef __itt_sync_acquired +#undef __itt_sync_releasing +#define __itt_sync_acquired(addr) \ + __notify_zc_intrinsic((char *)"sync_acquired", addr) +#define __itt_sync_releasing(addr) \ + __notify_intrinsic((char *)"sync_releasing", addr) +#endif +#endif +#endif + +static kmp_bootstrap_lock_t metadata_lock = + KMP_BOOTSTRAP_LOCK_INITIALIZER(metadata_lock); + +/* Parallel region reporting. + * __kmp_itt_region_forking should be called by master thread of a team. + Exact moment of call does not matter, but it should be completed before any + thread of this team calls __kmp_itt_region_starting. + * __kmp_itt_region_starting should be called by each thread of a team just + before entering parallel region body. + * __kmp_itt_region_finished should be called by each thread of a team right + after returning from parallel region body. + * __kmp_itt_region_joined should be called by master thread of a team, after + all threads called __kmp_itt_region_finished. + + Note: Thread waiting at join barrier (after __kmp_itt_region_finished) can + execute some more user code -- such a thread can execute tasks. + + Note: The overhead of logging region_starting and region_finished in each + thread is too large, so these calls are not used. */ + +LINKAGE void __kmp_itt_region_forking(int gtid, int team_size, int barriers) { +#if USE_ITT_NOTIFY + kmp_team_t *team = __kmp_team_from_gtid(gtid); + if (team->t.t_active_level > 1) { + // The frame notifications are only supported for the outermost teams. + return; + } + ident_t *loc = __kmp_thread_from_gtid(gtid)->th.th_ident; + if (loc) { + // Use the reserved_2 field to store the index to the region domain. + // Assume that reserved_2 contains zero initially. Since zero is special + // value here, store the index into domain array increased by 1. 
+ if (loc->reserved_2 == 0) { + if (__kmp_region_domain_count < KMP_MAX_FRAME_DOMAINS) { + int frm = + KMP_TEST_THEN_INC32(&__kmp_region_domain_count); // get "old" value + if (frm >= KMP_MAX_FRAME_DOMAINS) { + KMP_TEST_THEN_DEC32(&__kmp_region_domain_count); // revert the count + return; // loc->reserved_2 is still 0 + } + // if (!KMP_COMPARE_AND_STORE_ACQ32( &loc->reserved_2, 0, frm + 1 )) { + // frm = loc->reserved_2 - 1; // get value saved by other thread + // for same loc + //} // AC: this block is to replace next unsynchronized line + + // We need to save indexes for both region and barrier frames. We'll use + // loc->reserved_2 field but put region index to the low two bytes and + // barrier indexes to the high two bytes. It is OK because + // KMP_MAX_FRAME_DOMAINS = 512. + loc->reserved_2 |= (frm + 1); // save "new" value + + // Transform compiler-generated region location into the format + // that the tools more or less standardized on: + // "<func>$omp$parallel@[file:]<line>[:<col>]" + char *buff = NULL; + kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, 1); + buff = __kmp_str_format("%s$omp$parallel:%d@%s:%d:%d", str_loc.func, + team_size, str_loc.file, str_loc.line, + str_loc.col); + + __itt_suppress_push(__itt_suppress_memory_errors); + __kmp_itt_region_domains[frm] = __itt_domain_create(buff); + __itt_suppress_pop(); + + __kmp_str_free(&buff); + if (barriers) { + if (__kmp_barrier_domain_count < KMP_MAX_FRAME_DOMAINS) { + int frm = KMP_TEST_THEN_INC32( + &__kmp_barrier_domain_count); // get "old" value + if (frm >= KMP_MAX_FRAME_DOMAINS) { + KMP_TEST_THEN_DEC32( + &__kmp_barrier_domain_count); // revert the count + return; // loc->reserved_2 is still 0 + } + char *buff = NULL; + buff = __kmp_str_format("%s$omp$barrier@%s:%d", str_loc.func, + str_loc.file, str_loc.col); + __itt_suppress_push(__itt_suppress_memory_errors); + __kmp_itt_barrier_domains[frm] = __itt_domain_create(buff); + __itt_suppress_pop(); + __kmp_str_free(&buff); + // Save the 
barrier frame index to the high two bytes. + loc->reserved_2 |= (frm + 1) << 16; + } + } + __kmp_str_loc_free(&str_loc); + __itt_frame_begin_v3(__kmp_itt_region_domains[frm], NULL); + } + } else { // Region domain exists for this location + // Check if team size was changed. Then create new region domain for this + // location + unsigned int frm = (loc->reserved_2 & 0x0000FFFF) - 1; + if ((frm < KMP_MAX_FRAME_DOMAINS) && + (__kmp_itt_region_team_size[frm] != team_size)) { + char *buff = NULL; + kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, 1); + buff = __kmp_str_format("%s$omp$parallel:%d@%s:%d:%d", str_loc.func, + team_size, str_loc.file, str_loc.line, + str_loc.col); + + __itt_suppress_push(__itt_suppress_memory_errors); + __kmp_itt_region_domains[frm] = __itt_domain_create(buff); + __itt_suppress_pop(); + + __kmp_str_free(&buff); + __kmp_str_loc_free(&str_loc); + __kmp_itt_region_team_size[frm] = team_size; + __itt_frame_begin_v3(__kmp_itt_region_domains[frm], NULL); + } else { // Team size was not changed. Use existing domain. + __itt_frame_begin_v3(__kmp_itt_region_domains[frm], NULL); + } + } + KMP_ITT_DEBUG_LOCK(); + KMP_ITT_DEBUG_PRINT("[frm beg] gtid=%d, idx=%x, loc:%p\n", gtid, + loc->reserved_2, loc); + } +#endif +} // __kmp_itt_region_forking + +// ----------------------------------------------------------------------------- +LINKAGE void __kmp_itt_frame_submit(int gtid, __itt_timestamp begin, + __itt_timestamp end, int imbalance, + ident_t *loc, int team_size, int region) { +#if USE_ITT_NOTIFY + if (region) { + kmp_team_t *team = __kmp_team_from_gtid(gtid); + int serialized = (region == 2 ? 1 : 0); + if (team->t.t_active_level + serialized > 1) { + // The frame notifications are only supported for the outermost teams. + return; + } + // Check region domain has not been created before. It's index is saved in + // the low two bytes. 
+ if ((loc->reserved_2 & 0x0000FFFF) == 0) { + if (__kmp_region_domain_count < KMP_MAX_FRAME_DOMAINS) { + int frm = + KMP_TEST_THEN_INC32(&__kmp_region_domain_count); // get "old" value + if (frm >= KMP_MAX_FRAME_DOMAINS) { + KMP_TEST_THEN_DEC32(&__kmp_region_domain_count); // revert the count + return; // loc->reserved_2 is still 0 + } + + // We need to save indexes for both region and barrier frames. We'll use + // loc->reserved_2 field but put region index to the low two bytes and + // barrier indexes to the high two bytes. It is OK because + // KMP_MAX_FRAME_DOMAINS = 512. + loc->reserved_2 |= (frm + 1); // save "new" value + + // Transform compiler-generated region location into the format + // that the tools more or less standardized on: + // "<func>$omp$parallel:team_size@[file:]<line>[:<col>]" + char *buff = NULL; + kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, 1); + buff = __kmp_str_format("%s$omp$parallel:%d@%s:%d:%d", str_loc.func, + team_size, str_loc.file, str_loc.line, + str_loc.col); + + __itt_suppress_push(__itt_suppress_memory_errors); + __kmp_itt_region_domains[frm] = __itt_domain_create(buff); + __itt_suppress_pop(); + + __kmp_str_free(&buff); + __kmp_str_loc_free(&str_loc); + __kmp_itt_region_team_size[frm] = team_size; + __itt_frame_submit_v3(__kmp_itt_region_domains[frm], NULL, begin, end); + } + } else { // Region domain exists for this location + // Check if team size was changed. 
Then create new region domain for this + // location + unsigned int frm = (loc->reserved_2 & 0x0000FFFF) - 1; + if ((frm < KMP_MAX_FRAME_DOMAINS) && + (__kmp_itt_region_team_size[frm] != team_size)) { + char *buff = NULL; + kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, 1); + buff = __kmp_str_format("%s$omp$parallel:%d@%s:%d:%d", str_loc.func, + team_size, str_loc.file, str_loc.line, + str_loc.col); + + __itt_suppress_push(__itt_suppress_memory_errors); + __kmp_itt_region_domains[frm] = __itt_domain_create(buff); + __itt_suppress_pop(); + + __kmp_str_free(&buff); + __kmp_str_loc_free(&str_loc); + __kmp_itt_region_team_size[frm] = team_size; + __itt_frame_submit_v3(__kmp_itt_region_domains[frm], NULL, begin, end); + } else { // Team size was not changed. Use existing domain. + __itt_frame_submit_v3(__kmp_itt_region_domains[frm], NULL, begin, end); + } + } + KMP_ITT_DEBUG_LOCK(); + KMP_ITT_DEBUG_PRINT( + "[reg sub] gtid=%d, idx=%x, region:%d, loc:%p, beg:%llu, end:%llu\n", + gtid, loc->reserved_2, region, loc, begin, end); + return; + } else { // called for barrier reporting + if (loc) { + if ((loc->reserved_2 & 0xFFFF0000) == 0) { + if (__kmp_barrier_domain_count < KMP_MAX_FRAME_DOMAINS) { + int frm = KMP_TEST_THEN_INC32( + &__kmp_barrier_domain_count); // get "old" value + if (frm >= KMP_MAX_FRAME_DOMAINS) { + KMP_TEST_THEN_DEC32( + &__kmp_barrier_domain_count); // revert the count + return; // loc->reserved_2 is still 0 + } + // Save the barrier frame index to the high two bytes. 
+ loc->reserved_2 |= (frm + 1) << 16; // save "new" value + + // Transform compiler-generated region location into the format + // that the tools more or less standardized on: + // "<func>$omp$frame@[file:]<line>[:<col>]" + kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, 1); + if (imbalance) { + char *buff_imb = NULL; + buff_imb = __kmp_str_format("%s$omp$barrier-imbalance:%d@%s:%d", + str_loc.func, team_size, str_loc.file, + str_loc.col); + __itt_suppress_push(__itt_suppress_memory_errors); + __kmp_itt_imbalance_domains[frm] = __itt_domain_create(buff_imb); + __itt_suppress_pop(); + __itt_frame_submit_v3(__kmp_itt_imbalance_domains[frm], NULL, begin, + end); + __kmp_str_free(&buff_imb); + } else { + char *buff = NULL; + buff = __kmp_str_format("%s$omp$barrier@%s:%d", str_loc.func, + str_loc.file, str_loc.col); + __itt_suppress_push(__itt_suppress_memory_errors); + __kmp_itt_barrier_domains[frm] = __itt_domain_create(buff); + __itt_suppress_pop(); + __itt_frame_submit_v3(__kmp_itt_barrier_domains[frm], NULL, begin, + end); + __kmp_str_free(&buff); + } + __kmp_str_loc_free(&str_loc); + } + } else { // if it is not 0 then it should be <= KMP_MAX_FRAME_DOMAINS + if (imbalance) { + __itt_frame_submit_v3( + __kmp_itt_imbalance_domains[(loc->reserved_2 >> 16) - 1], NULL, + begin, end); + } else { + __itt_frame_submit_v3( + __kmp_itt_barrier_domains[(loc->reserved_2 >> 16) - 1], NULL, + begin, end); + } + } + KMP_ITT_DEBUG_LOCK(); + KMP_ITT_DEBUG_PRINT( + "[frm sub] gtid=%d, idx=%x, loc:%p, beg:%llu, end:%llu\n", gtid, + loc->reserved_2, loc, begin, end); + } + } +#endif +} // __kmp_itt_frame_submit + +// ----------------------------------------------------------------------------- +LINKAGE void __kmp_itt_metadata_imbalance(int gtid, kmp_uint64 begin, + kmp_uint64 end, kmp_uint64 imbalance, + kmp_uint64 reduction) { +#if USE_ITT_NOTIFY + if (metadata_domain == NULL) { + __kmp_acquire_bootstrap_lock(&metadata_lock); + if (metadata_domain == NULL) { + 
__itt_suppress_push(__itt_suppress_memory_errors); + metadata_domain = __itt_domain_create("OMP Metadata"); + string_handle_imbl = __itt_string_handle_create("omp_metadata_imbalance"); + string_handle_loop = __itt_string_handle_create("omp_metadata_loop"); + string_handle_sngl = __itt_string_handle_create("omp_metadata_single"); + __itt_suppress_pop(); + } + __kmp_release_bootstrap_lock(&metadata_lock); + } + + kmp_uint64 imbalance_data[4]; + imbalance_data[0] = begin; + imbalance_data[1] = end; + imbalance_data[2] = imbalance; + imbalance_data[3] = reduction; + + __itt_metadata_add(metadata_domain, __itt_null, string_handle_imbl, + __itt_metadata_u64, 4, imbalance_data); +#endif +} // __kmp_itt_metadata_imbalance + +// ----------------------------------------------------------------------------- +LINKAGE void __kmp_itt_metadata_loop(ident_t *loc, kmp_uint64 sched_type, + kmp_uint64 iterations, kmp_uint64 chunk) { +#if USE_ITT_NOTIFY + if (metadata_domain == NULL) { + __kmp_acquire_bootstrap_lock(&metadata_lock); + if (metadata_domain == NULL) { + __itt_suppress_push(__itt_suppress_memory_errors); + metadata_domain = __itt_domain_create("OMP Metadata"); + string_handle_imbl = __itt_string_handle_create("omp_metadata_imbalance"); + string_handle_loop = __itt_string_handle_create("omp_metadata_loop"); + string_handle_sngl = __itt_string_handle_create("omp_metadata_single"); + __itt_suppress_pop(); + } + __kmp_release_bootstrap_lock(&metadata_lock); + } + + // Parse line and column from psource string: ";file;func;line;col;;" + char *s_line; + char *s_col; + KMP_DEBUG_ASSERT(loc->psource); +#ifdef __cplusplus + s_line = strchr(CCAST(char *, loc->psource), ';'); +#else + s_line = strchr(loc->psource, ';'); +#endif + KMP_DEBUG_ASSERT(s_line); + s_line = strchr(s_line + 1, ';'); // 2-nd semicolon + KMP_DEBUG_ASSERT(s_line); + s_line = strchr(s_line + 1, ';'); // 3-rd semicolon + KMP_DEBUG_ASSERT(s_line); + s_col = strchr(s_line + 1, ';'); // 4-th semicolon + 
KMP_DEBUG_ASSERT(s_col); + + kmp_uint64 loop_data[5]; + loop_data[0] = atoi(s_line + 1); // read line + loop_data[1] = atoi(s_col + 1); // read column + loop_data[2] = sched_type; + loop_data[3] = iterations; + loop_data[4] = chunk; + + __itt_metadata_add(metadata_domain, __itt_null, string_handle_loop, + __itt_metadata_u64, 5, loop_data); +#endif +} // __kmp_itt_metadata_loop + +// ----------------------------------------------------------------------------- +LINKAGE void __kmp_itt_metadata_single(ident_t *loc) { +#if USE_ITT_NOTIFY + if (metadata_domain == NULL) { + __kmp_acquire_bootstrap_lock(&metadata_lock); + if (metadata_domain == NULL) { + __itt_suppress_push(__itt_suppress_memory_errors); + metadata_domain = __itt_domain_create("OMP Metadata"); + string_handle_imbl = __itt_string_handle_create("omp_metadata_imbalance"); + string_handle_loop = __itt_string_handle_create("omp_metadata_loop"); + string_handle_sngl = __itt_string_handle_create("omp_metadata_single"); + __itt_suppress_pop(); + } + __kmp_release_bootstrap_lock(&metadata_lock); + } + + kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, 1); + kmp_uint64 single_data[2]; + single_data[0] = str_loc.line; + single_data[1] = str_loc.col; + + __kmp_str_loc_free(&str_loc); + + __itt_metadata_add(metadata_domain, __itt_null, string_handle_sngl, + __itt_metadata_u64, 2, single_data); +#endif +} // __kmp_itt_metadata_single + +// ----------------------------------------------------------------------------- +LINKAGE void __kmp_itt_region_starting(int gtid) { +#if USE_ITT_NOTIFY +#endif +} // __kmp_itt_region_starting + +// ----------------------------------------------------------------------------- +LINKAGE void __kmp_itt_region_finished(int gtid) { +#if USE_ITT_NOTIFY +#endif +} // __kmp_itt_region_finished + +// ---------------------------------------------------------------------------- +LINKAGE void __kmp_itt_region_joined(int gtid) { +#if USE_ITT_NOTIFY + kmp_team_t *team = 
__kmp_team_from_gtid(gtid); + if (team->t.t_active_level > 1) { + // The frame notifications are only supported for the outermost teams. + return; + } + ident_t *loc = __kmp_thread_from_gtid(gtid)->th.th_ident; + if (loc && loc->reserved_2) { + unsigned int frm = (loc->reserved_2 & 0x0000FFFF) - 1; + if (frm < KMP_MAX_FRAME_DOMAINS) { + KMP_ITT_DEBUG_LOCK(); + __itt_frame_end_v3(__kmp_itt_region_domains[frm], NULL); + KMP_ITT_DEBUG_PRINT("[frm end] gtid=%d, idx=%x, loc:%p\n", gtid, + loc->reserved_2, loc); + } + } +#endif +} // __kmp_itt_region_joined + +/* Barriers reporting. + + A barrier consists of two phases: + 1. Gather -- master waits for arriving of all the worker threads; each + worker thread registers arrival and goes further. + 2. Release -- each worker threads waits until master lets it go; master lets + worker threads go. + + Function should be called by each thread: + * __kmp_itt_barrier_starting() -- before arriving to the gather phase. + * __kmp_itt_barrier_middle() -- between gather and release phases. + * __kmp_itt_barrier_finished() -- after release phase. + + Note: Call __kmp_itt_barrier_object() before call to + __kmp_itt_barrier_starting() and save result in local variable. + __kmp_itt_barrier_object(), being called too late (e. g. after gather phase) + would return itt sync object for the next barrier! + + ITT need an address (void *) to be specified as a sync object. OpenMP RTL + does not have barrier object or barrier data structure. Barrier is just a + counter in team and thread structures. We could use an address of team + structure as an barrier sync object, but ITT wants different objects for + different barriers (even whithin the same team). So let us use team address + as barrier sync object for the first barrier, then increase it by one for the + next barrier, and so on (but wrap it not to use addresses outside of team + structure). 
*/ + +void *__kmp_itt_barrier_object(int gtid, int bt, int set_name, + int delta // 0 (current barrier) is default + // value; specify -1 to get previous + // barrier. + ) { + void *object = NULL; +#if USE_ITT_NOTIFY + kmp_info_t *thr = __kmp_thread_from_gtid(gtid); + kmp_team_t *team = thr->th.th_team; + + // NOTE: If the function is called from __kmp_fork_barrier, team pointer can + // be NULL. This "if" helps to avoid crash. However, this is not complete + // solution, and reporting fork/join barriers to ITT should be revisited. + + if (team != NULL) { + // Master thread increases b_arrived by KMP_BARRIER_STATE_BUMP each time. + // Divide b_arrived by KMP_BARRIER_STATE_BUMP to get plain barrier counter. + kmp_uint64 counter = + team->t.t_bar[bt].b_arrived / KMP_BARRIER_STATE_BUMP + delta; + // Now form the barrier id. Encode barrier type (bt) in barrier id too, so + // barriers of different types do not have the same ids. + KMP_BUILD_ASSERT(sizeof(kmp_team_t) >= bs_last_barrier); + // This conditon is a must (we would have zero divide otherwise). + KMP_BUILD_ASSERT(sizeof(kmp_team_t) >= 2 * bs_last_barrier); + // More strong condition: make sure we have room at least for for two + // differtent ids (for each barrier type). + object = reinterpret_cast<void *>( + kmp_uintptr_t(team) + + counter % (sizeof(kmp_team_t) / bs_last_barrier) * bs_last_barrier + + bt); + KMP_ITT_DEBUG_LOCK(); + KMP_ITT_DEBUG_PRINT("[bar obj] type=%d, counter=%lld, object=%p\n", bt, + counter, object); + + if (set_name) { + ident_t const *loc = NULL; + char const *src = NULL; + char const *type = "OMP Barrier"; + switch (bt) { + case bs_plain_barrier: { + // For plain barrier compiler calls __kmpc_barrier() function, which + // saves location in thr->th.th_ident. + loc = thr->th.th_ident; + // Get the barrier type from flags provided by compiler. 
+ kmp_int32 expl = 0; + kmp_uint32 impl = 0; + if (loc != NULL) { + src = loc->psource; + expl = (loc->flags & KMP_IDENT_BARRIER_EXPL) != 0; + impl = (loc->flags & KMP_IDENT_BARRIER_IMPL) != 0; + } + if (impl) { + switch (loc->flags & KMP_IDENT_BARRIER_IMPL_MASK) { + case KMP_IDENT_BARRIER_IMPL_FOR: { + type = "OMP For Barrier"; + } break; + case KMP_IDENT_BARRIER_IMPL_SECTIONS: { + type = "OMP Sections Barrier"; + } break; + case KMP_IDENT_BARRIER_IMPL_SINGLE: { + type = "OMP Single Barrier"; + } break; + case KMP_IDENT_BARRIER_IMPL_WORKSHARE: { + type = "OMP Workshare Barrier"; + } break; + default: { + type = "OMP Implicit Barrier"; + KMP_DEBUG_ASSERT(0); + } + } + } else if (expl) { + type = "OMP Explicit Barrier"; + } + } break; + case bs_forkjoin_barrier: { + // In case of fork/join barrier we can read thr->th.th_ident, because it + // contains location of last passed construct (while join barrier is not + // such one). Use th_ident of master thread instead -- __kmp_join_call() + // called by the master thread saves location. + // + // AC: cannot read from master because __kmp_join_call may be not called + // yet, so we read the location from team. This is the same location. + // And team is valid at the enter to join barrier where this happens. 
+ loc = team->t.t_ident; + if (loc != NULL) { + src = loc->psource; + } + type = "OMP Join Barrier"; + } break; + } + KMP_ITT_DEBUG_LOCK(); + __itt_sync_create(object, type, src, __itt_attr_barrier); + KMP_ITT_DEBUG_PRINT( + "[bar sta] scre( %p, \"%s\", \"%s\", __itt_attr_barrier )\n", object, + type, src); + } + } +#endif + return object; +} // __kmp_itt_barrier_object + +// ----------------------------------------------------------------------------- +void __kmp_itt_barrier_starting(int gtid, void *object) { +#if USE_ITT_NOTIFY + if (!KMP_MASTER_GTID(gtid)) { + KMP_ITT_DEBUG_LOCK(); + __itt_sync_releasing(object); + KMP_ITT_DEBUG_PRINT("[bar sta] srel( %p )\n", object); + } + KMP_ITT_DEBUG_LOCK(); + __itt_sync_prepare(object); + KMP_ITT_DEBUG_PRINT("[bar sta] spre( %p )\n", object); +#endif +} // __kmp_itt_barrier_starting + +// ----------------------------------------------------------------------------- +void __kmp_itt_barrier_middle(int gtid, void *object) { +#if USE_ITT_NOTIFY + if (KMP_MASTER_GTID(gtid)) { + KMP_ITT_DEBUG_LOCK(); + __itt_sync_acquired(object); + KMP_ITT_DEBUG_PRINT("[bar mid] sacq( %p )\n", object); + KMP_ITT_DEBUG_LOCK(); + __itt_sync_releasing(object); + KMP_ITT_DEBUG_PRINT("[bar mid] srel( %p )\n", object); + } else { + } +#endif +} // __kmp_itt_barrier_middle + +// ----------------------------------------------------------------------------- +void __kmp_itt_barrier_finished(int gtid, void *object) { +#if USE_ITT_NOTIFY + if (KMP_MASTER_GTID(gtid)) { + } else { + KMP_ITT_DEBUG_LOCK(); + __itt_sync_acquired(object); + KMP_ITT_DEBUG_PRINT("[bar end] sacq( %p )\n", object); + } +#endif +} // __kmp_itt_barrier_finished + +/* Taskwait reporting. + ITT need an address (void *) to be specified as a sync object. OpenMP RTL + does not have taskwait structure, so we need to construct something. 
*/ + +void *__kmp_itt_taskwait_object(int gtid) { + void *object = NULL; +#if USE_ITT_NOTIFY + if (__itt_sync_create_ptr) { + kmp_info_t *thread = __kmp_thread_from_gtid(gtid); + kmp_taskdata_t *taskdata = thread->th.th_current_task; + object = reinterpret_cast<void *>(kmp_uintptr_t(taskdata) + + taskdata->td_taskwait_counter % + sizeof(kmp_taskdata_t)); + } +#endif + return object; +} // __kmp_itt_taskwait_object + +void __kmp_itt_taskwait_starting(int gtid, void *object) { +#if USE_ITT_NOTIFY + kmp_info_t *thread = __kmp_thread_from_gtid(gtid); + kmp_taskdata_t *taskdata = thread->th.th_current_task; + ident_t const *loc = taskdata->td_taskwait_ident; + char const *src = (loc == NULL ? NULL : loc->psource); + KMP_ITT_DEBUG_LOCK(); + __itt_sync_create(object, "OMP Taskwait", src, 0); + KMP_ITT_DEBUG_PRINT("[twa sta] scre( %p, \"OMP Taskwait\", \"%s\", 0 )\n", + object, src); + KMP_ITT_DEBUG_LOCK(); + __itt_sync_prepare(object); + KMP_ITT_DEBUG_PRINT("[twa sta] spre( %p )\n", object); +#endif +} // __kmp_itt_taskwait_starting + +void __kmp_itt_taskwait_finished(int gtid, void *object) { +#if USE_ITT_NOTIFY + KMP_ITT_DEBUG_LOCK(); + __itt_sync_acquired(object); + KMP_ITT_DEBUG_PRINT("[twa end] sacq( %p )\n", object); + KMP_ITT_DEBUG_LOCK(); + __itt_sync_destroy(object); + KMP_ITT_DEBUG_PRINT("[twa end] sdes( %p )\n", object); +#endif +} // __kmp_itt_taskwait_finished + +/* Task reporting. + Only those tasks are reported which are executed by a thread spinning at + barrier (or taskwait). Synch object passed to the function must be barrier of + taskwait the threads waiting at. */ + +void __kmp_itt_task_starting( + void *object // ITT sync object: barrier or taskwait. 
+ ) { +#if USE_ITT_NOTIFY + if (object != NULL) { + KMP_ITT_DEBUG_LOCK(); + __itt_sync_cancel(object); + KMP_ITT_DEBUG_PRINT("[tsk sta] scan( %p )\n", object); + } +#endif +} // __kmp_itt_task_starting + +// ----------------------------------------------------------------------------- +void __kmp_itt_task_finished( + void *object // ITT sync object: barrier or taskwait. + ) { +#if USE_ITT_NOTIFY + KMP_ITT_DEBUG_LOCK(); + __itt_sync_prepare(object); + KMP_ITT_DEBUG_PRINT("[tsk end] spre( %p )\n", object); +#endif +} // __kmp_itt_task_finished + +/* Lock reporting. + * __kmp_itt_lock_creating( lock ) should be called *before* the first lock + operation (set/unset). It is not a real event shown to the user but just + setting a name for synchronization object. `lock' is an address of sync + object, the same address should be used in all subsequent calls. + * __kmp_itt_lock_acquiring() should be called before setting the lock. + * __kmp_itt_lock_acquired() should be called after setting the lock. + * __kmp_itt_lock_realeasing() should be called before unsetting the lock. + * __kmp_itt_lock_cancelled() should be called after thread cancelled waiting + for the lock. + * __kmp_itt_lock_destroyed( lock ) should be called after the last lock + operation. After __kmp_itt_lock_destroyed() all the references to the same + address will be considered as another sync object, not related with the + original one. */ + +#if KMP_USE_DYNAMIC_LOCK +// Takes location information directly +__kmp_inline void ___kmp_itt_lock_init(kmp_user_lock_p lock, char const *type, + const ident_t *loc) { +#if USE_ITT_NOTIFY + if (__itt_sync_create_ptr) { + char const *src = (loc == NULL ? NULL : loc->psource); + KMP_ITT_DEBUG_LOCK(); + __itt_sync_create(lock, type, src, 0); + KMP_ITT_DEBUG_PRINT("[lck ini] scre( %p, \"%s\", \"%s\", 0 )\n", lock, type, + src); + } +#endif +} +#else // KMP_USE_DYNAMIC_LOCK +// Internal guts -- common code for locks and critical sections, do not call +// directly. 
+__kmp_inline void ___kmp_itt_lock_init(kmp_user_lock_p lock, char const *type) { +#if USE_ITT_NOTIFY + if (__itt_sync_create_ptr) { + ident_t const *loc = NULL; + if (__kmp_get_user_lock_location_ != NULL) + loc = __kmp_get_user_lock_location_((lock)); + char const *src = (loc == NULL ? NULL : loc->psource); + KMP_ITT_DEBUG_LOCK(); + __itt_sync_create(lock, type, src, 0); + KMP_ITT_DEBUG_PRINT("[lck ini] scre( %p, \"%s\", \"%s\", 0 )\n", lock, type, + src); + } +#endif +} // ___kmp_itt_lock_init +#endif // KMP_USE_DYNAMIC_LOCK + +// Internal guts -- common code for locks and critical sections, do not call +// directly. +__kmp_inline void ___kmp_itt_lock_fini(kmp_user_lock_p lock, char const *type) { +#if USE_ITT_NOTIFY + KMP_ITT_DEBUG_LOCK(); + __itt_sync_destroy(lock); + KMP_ITT_DEBUG_PRINT("[lck dst] sdes( %p )\n", lock); +#endif +} // ___kmp_itt_lock_fini + +// ----------------------------------------------------------------------------- +#if KMP_USE_DYNAMIC_LOCK +void __kmp_itt_lock_creating(kmp_user_lock_p lock, const ident_t *loc) { + ___kmp_itt_lock_init(lock, "OMP Lock", loc); +} +#else +void __kmp_itt_lock_creating(kmp_user_lock_p lock) { + ___kmp_itt_lock_init(lock, "OMP Lock"); +} // __kmp_itt_lock_creating +#endif + +void __kmp_itt_lock_acquiring(kmp_user_lock_p lock) { +#if KMP_USE_DYNAMIC_LOCK && USE_ITT_NOTIFY + // postpone lock object access + if (__itt_sync_prepare_ptr) { + if (KMP_EXTRACT_D_TAG(lock) == 0) { + kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); + __itt_sync_prepare(ilk->lock); + } else { + __itt_sync_prepare(lock); + } + } +#else + __itt_sync_prepare(lock); +#endif +} // __kmp_itt_lock_acquiring + +void __kmp_itt_lock_acquired(kmp_user_lock_p lock) { +#if KMP_USE_DYNAMIC_LOCK && USE_ITT_NOTIFY + // postpone lock object access + if (__itt_sync_acquired_ptr) { + if (KMP_EXTRACT_D_TAG(lock) == 0) { + kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); + __itt_sync_acquired(ilk->lock); + } else { + __itt_sync_acquired(lock); + } + } 
+#else + __itt_sync_acquired(lock); +#endif +} // __kmp_itt_lock_acquired + +void __kmp_itt_lock_releasing(kmp_user_lock_p lock) { +#if KMP_USE_DYNAMIC_LOCK && USE_ITT_NOTIFY + if (__itt_sync_releasing_ptr) { + if (KMP_EXTRACT_D_TAG(lock) == 0) { + kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); + __itt_sync_releasing(ilk->lock); + } else { + __itt_sync_releasing(lock); + } + } +#else + __itt_sync_releasing(lock); +#endif +} // __kmp_itt_lock_releasing + +void __kmp_itt_lock_cancelled(kmp_user_lock_p lock) { +#if KMP_USE_DYNAMIC_LOCK && USE_ITT_NOTIFY + if (__itt_sync_cancel_ptr) { + if (KMP_EXTRACT_D_TAG(lock) == 0) { + kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); + __itt_sync_cancel(ilk->lock); + } else { + __itt_sync_cancel(lock); + } + } +#else + __itt_sync_cancel(lock); +#endif +} // __kmp_itt_lock_cancelled + +void __kmp_itt_lock_destroyed(kmp_user_lock_p lock) { + ___kmp_itt_lock_fini(lock, "OMP Lock"); +} // __kmp_itt_lock_destroyed + +/* Critical reporting. + Critical sections are treated exactly as locks (but have different object + type). */ +#if KMP_USE_DYNAMIC_LOCK +void __kmp_itt_critical_creating(kmp_user_lock_p lock, const ident_t *loc) { + ___kmp_itt_lock_init(lock, "OMP Critical", loc); +} +#else +void __kmp_itt_critical_creating(kmp_user_lock_p lock) { + ___kmp_itt_lock_init(lock, "OMP Critical"); +} // __kmp_itt_critical_creating +#endif + +void __kmp_itt_critical_acquiring(kmp_user_lock_p lock) { + __itt_sync_prepare(lock); +} // __kmp_itt_critical_acquiring + +void __kmp_itt_critical_acquired(kmp_user_lock_p lock) { + __itt_sync_acquired(lock); +} // __kmp_itt_critical_acquired + +void __kmp_itt_critical_releasing(kmp_user_lock_p lock) { + __itt_sync_releasing(lock); +} // __kmp_itt_critical_releasing + +void __kmp_itt_critical_destroyed(kmp_user_lock_p lock) { + ___kmp_itt_lock_fini(lock, "OMP Critical"); +} // __kmp_itt_critical_destroyed + +/* Single reporting. 
*/ + +void __kmp_itt_single_start(int gtid) { +#if USE_ITT_NOTIFY + if (__itt_mark_create_ptr || KMP_ITT_DEBUG) { + kmp_info_t *thr = __kmp_thread_from_gtid((gtid)); + ident_t *loc = thr->th.th_ident; + char const *src = (loc == NULL ? NULL : loc->psource); + kmp_str_buf_t name; + __kmp_str_buf_init(&name); + __kmp_str_buf_print(&name, "OMP Single-%s", src); + KMP_ITT_DEBUG_LOCK(); + thr->th.th_itt_mark_single = __itt_mark_create(name.str); + KMP_ITT_DEBUG_PRINT("[sin sta] mcre( \"%s\") -> %d\n", name.str, + thr->th.th_itt_mark_single); + __kmp_str_buf_free(&name); + KMP_ITT_DEBUG_LOCK(); + __itt_mark(thr->th.th_itt_mark_single, NULL); + KMP_ITT_DEBUG_PRINT("[sin sta] mark( %d, NULL )\n", + thr->th.th_itt_mark_single); + } +#endif +} // __kmp_itt_single_start + +void __kmp_itt_single_end(int gtid) { +#if USE_ITT_NOTIFY + __itt_mark_type mark = __kmp_thread_from_gtid(gtid)->th.th_itt_mark_single; + KMP_ITT_DEBUG_LOCK(); + __itt_mark_off(mark); + KMP_ITT_DEBUG_PRINT("[sin end] moff( %d )\n", mark); +#endif +} // __kmp_itt_single_end + +/* Ordered reporting. + * __kmp_itt_ordered_init is called by each thread *before* first using sync + object. ITT team would like it to be called once, but it requires extra + synchronization. + * __kmp_itt_ordered_prep is called when thread is going to enter ordered + section (before synchronization). + * __kmp_itt_ordered_start is called just before entering user code (after + synchronization). + * __kmp_itt_ordered_end is called after returning from user code. + + Sync object is th->th.th_dispatch->th_dispatch_sh_current. + Events are not generated in case of serialized team. */ + +void __kmp_itt_ordered_init(int gtid) { +#if USE_ITT_NOTIFY + if (__itt_sync_create_ptr) { + kmp_info_t *thr = __kmp_thread_from_gtid(gtid); + ident_t const *loc = thr->th.th_ident; + char const *src = (loc == NULL ? 
NULL : loc->psource); + __itt_sync_create(thr->th.th_dispatch->th_dispatch_sh_current, + "OMP Ordered", src, 0); + } +#endif +} // __kmp_itt_ordered_init + +void __kmp_itt_ordered_prep(int gtid) { +#if USE_ITT_NOTIFY + if (__itt_sync_create_ptr) { + kmp_team_t *t = __kmp_team_from_gtid(gtid); + if (!t->t.t_serialized) { + kmp_info_t *th = __kmp_thread_from_gtid(gtid); + __itt_sync_prepare(th->th.th_dispatch->th_dispatch_sh_current); + } + } +#endif +} // __kmp_itt_ordered_prep + +void __kmp_itt_ordered_start(int gtid) { +#if USE_ITT_NOTIFY + if (__itt_sync_create_ptr) { + kmp_team_t *t = __kmp_team_from_gtid(gtid); + if (!t->t.t_serialized) { + kmp_info_t *th = __kmp_thread_from_gtid(gtid); + __itt_sync_acquired(th->th.th_dispatch->th_dispatch_sh_current); + } + } +#endif +} // __kmp_itt_ordered_start + +void __kmp_itt_ordered_end(int gtid) { +#if USE_ITT_NOTIFY + if (__itt_sync_create_ptr) { + kmp_team_t *t = __kmp_team_from_gtid(gtid); + if (!t->t.t_serialized) { + kmp_info_t *th = __kmp_thread_from_gtid(gtid); + __itt_sync_releasing(th->th.th_dispatch->th_dispatch_sh_current); + } + } +#endif +} // __kmp_itt_ordered_end + +/* Threads reporting. */ + +void __kmp_itt_thread_ignore() { + __itt_thr_ignore(); +} // __kmp_itt_thread_ignore + +void __kmp_itt_thread_name(int gtid) { +#if USE_ITT_NOTIFY + if (__itt_thr_name_set_ptr) { + kmp_str_buf_t name; + __kmp_str_buf_init(&name); + if (KMP_MASTER_GTID(gtid)) { + __kmp_str_buf_print(&name, "OMP Master Thread #%d", gtid); + } else { + __kmp_str_buf_print(&name, "OMP Worker Thread #%d", gtid); + } + KMP_ITT_DEBUG_LOCK(); + __itt_thr_name_set(name.str, name.used); + KMP_ITT_DEBUG_PRINT("[thr nam] name( \"%s\")\n", name.str); + __kmp_str_buf_free(&name); + } +#endif +} // __kmp_itt_thread_name + +/* System object reporting. + ITT catches operations with system sync objects (like Windows* OS on IA-32 + architecture API critical sections and events). 
We only need to specify + name ("OMP Scheduler") for the object to let ITT know it is an object used + by OpenMP RTL for internal purposes. */ + +void __kmp_itt_system_object_created(void *object, char const *name) { +#if USE_ITT_NOTIFY + KMP_ITT_DEBUG_LOCK(); + __itt_sync_create(object, "OMP Scheduler", name, 0); + KMP_ITT_DEBUG_PRINT("[sys obj] scre( %p, \"OMP Scheduler\", \"%s\", 0 )\n", + object, name); +#endif +} // __kmp_itt_system_object_created + +/* Stack stitching api. + Master calls "create" and put the stitching id into team structure. + Workers read the stitching id and call "enter" / "leave" api. + Master calls "destroy" at the end of the parallel region. */ + +__itt_caller __kmp_itt_stack_caller_create() { +#if USE_ITT_NOTIFY + if (!__itt_stack_caller_create_ptr) + return NULL; + KMP_ITT_DEBUG_LOCK(); + __itt_caller id = __itt_stack_caller_create(); + KMP_ITT_DEBUG_PRINT("[stk cre] %p\n", id); + return id; +#endif + return NULL; +} + +void __kmp_itt_stack_caller_destroy(__itt_caller id) { +#if USE_ITT_NOTIFY + if (__itt_stack_caller_destroy_ptr) { + KMP_ITT_DEBUG_LOCK(); + __itt_stack_caller_destroy(id); + KMP_ITT_DEBUG_PRINT("[stk des] %p\n", id); + } +#endif +} + +void __kmp_itt_stack_callee_enter(__itt_caller id) { +#if USE_ITT_NOTIFY + if (__itt_stack_callee_enter_ptr) { + KMP_ITT_DEBUG_LOCK(); + __itt_stack_callee_enter(id); + KMP_ITT_DEBUG_PRINT("[stk ent] %p\n", id); + } +#endif +} + +void __kmp_itt_stack_callee_leave(__itt_caller id) { +#if USE_ITT_NOTIFY + if (__itt_stack_callee_leave_ptr) { + KMP_ITT_DEBUG_LOCK(); + __itt_stack_callee_leave(id); + KMP_ITT_DEBUG_PRINT("[stk lea] %p\n", id); + } +#endif +} + +#endif /* USE_ITT_BUILD */ |