aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJonathan Peyton <jonathan.l.peyton@intel.com>2018-07-30 17:41:08 +0000
committerJonathan Peyton <jonathan.l.peyton@intel.com>2018-07-30 17:41:08 +0000
commit0d4ae7a6d19a538fdbd9510cb6109a52935efd16 (patch)
treed0914ff87ff86455ac8232d4a678326d9715f042
parentde6991eeb9d364f7efa2e0f1821e35f9c3dcdb48 (diff)
[OpenMP][Stats] Cleanup stats gathering code
1) Remove unnecessary data from list node structure 2) Remove timerPair in favor of pushing/popping explicitTimers. This way, nested timers will work properly. 3) Fix #pragma omp critical timers 4) Add histogram capability 5) Add KMP_STATS_FILE formatting capability 6) Have time partitioned into serial & parallel by introducing partitionedTimers::exchange(). This also counts the number of serial regions in the executable. 7) Fix up the timers around OMP loops so that scheduling overhead and work are both counted correctly. 8) Fix up the iterations statistics so they count the number of iterations the thread receives at each loop scheduling event 9) Change timers so there is only one RDTSC read per event change 10) Fix up the outdated comments for the timers Differential Revision: https://reviews.llvm.org/D49699 git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@338276 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--runtime/src/kmp_csupport.cpp26
-rw-r--r--runtime/src/kmp_dispatch.cpp68
-rw-r--r--runtime/src/kmp_runtime.cpp30
-rw-r--r--runtime/src/kmp_sched.cpp27
-rw-r--r--runtime/src/kmp_stats.cpp347
-rw-r--r--runtime/src/kmp_stats.h403
-rw-r--r--runtime/src/kmp_stats_timing.cpp9
-rw-r--r--runtime/src/z_Linux_util.cpp2
8 files changed, 609 insertions, 303 deletions
diff --git a/runtime/src/kmp_csupport.cpp b/runtime/src/kmp_csupport.cpp
index da4530c..23cc15b 100644
--- a/runtime/src/kmp_csupport.cpp
+++ b/runtime/src/kmp_csupport.cpp
@@ -262,6 +262,14 @@ void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
int gtid = __kmp_entry_gtid();
#if (KMP_STATS_ENABLED)
+ // If we were in a serial region, then stop the serial timer, record
+ // the event, and start parallel region timer
+ stats_state_e previous_state = KMP_GET_THREAD_STATE();
+ if (previous_state == stats_state_e::SERIAL_REGION) {
+ KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
+ } else {
+ KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
+ }
int inParallel = __kmpc_in_parallel(loc);
if (inParallel) {
KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
@@ -318,6 +326,14 @@ void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
va_end(ap);
}
+
+#if KMP_STATS_ENABLED
+ if (previous_state == stats_state_e::SERIAL_REGION) {
+ KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
+ } else {
+ KMP_POP_PARTITIONED_TIMER();
+ }
+#endif // KMP_STATS_ENABLED
}
#if OMP_40_ENABLED
@@ -1115,8 +1131,6 @@ void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
__kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
#else
KMP_COUNT_BLOCK(OMP_CRITICAL);
- KMP_TIME_PARTITIONED_BLOCK(
- OMP_critical_wait); /* Time spent waiting to enter the critical section */
#if OMPT_SUPPORT && OMPT_OPTIONAL
omp_state_t prev_state = omp_state_undefined;
ompt_thread_info_t ti;
@@ -1127,6 +1141,7 @@ void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
// TODO: add THR_OVHD_STATE
+ KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
KMP_CHECK_USER_LOCK_INIT();
if ((__kmp_user_lock_kind == lk_tas) &&
@@ -1193,8 +1208,9 @@ void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
}
}
#endif
+ KMP_POP_PARTITIONED_TIMER();
- KMP_START_EXPLICIT_TIMER(OMP_critical);
+ KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
#endif // KMP_USE_DYNAMIC_LOCK
}
@@ -1345,6 +1361,7 @@ void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
// Check if it is initialized.
+ KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
if (*lk == 0) {
kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
if (KMP_IS_D_LOCK(lckseq)) {
@@ -1422,6 +1439,7 @@ void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
#endif
KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
}
+ KMP_POP_PARTITIONED_TIMER();
#if USE_ITT_BUILD
__kmp_itt_critical_acquired(lck);
@@ -1753,6 +1771,7 @@ void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
Mark the end of a statically scheduled loop.
*/
void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
+ KMP_POP_PARTITIONED_TIMER();
KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
#if OMPT_SUPPORT && OMPT_OPTIONAL
@@ -1779,7 +1798,6 @@ void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
&(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
}
#endif
-
if (__kmp_env_consistency_check)
__kmp_pop_workshare(global_tid, ct_pdo, loc);
}
diff --git a/runtime/src/kmp_dispatch.cpp b/runtime/src/kmp_dispatch.cpp
index 1306c55..5545ff5 100644
--- a/runtime/src/kmp_dispatch.cpp
+++ b/runtime/src/kmp_dispatch.cpp
@@ -321,7 +321,7 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
ntc = (tc % chunk ? 1 : 0) + tc / chunk;
if (nproc > 1 && ntc >= nproc) {
- KMP_COUNT_BLOCK(OMP_FOR_static_steal);
+ KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
T id = tid;
T small_chunk, extras;
@@ -770,6 +770,15 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
active = !team->t.t_serialized;
th->th.th_ident = loc;
+ // Any half-decent optimizer will remove this test when the blocks are empty
+ // since the macros expand to nothing
+ // when statistics are disabled.
+ if (schedule == __kmp_static) {
+ KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
+ } else {
+ KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
+ }
+
#if KMP_USE_HIER_SCHED
// Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable
// Hierarchical scheduling does not work with ordered, so if ordered is
@@ -846,17 +855,6 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
}
}
- // Any half-decent optimizer will remove this test when the blocks are empty
- // since the macros expand to nothing
- // when statistics are disabled.
- if (schedule == __kmp_static) {
- KMP_COUNT_BLOCK(OMP_FOR_static);
- KMP_COUNT_VALUE(FOR_static_iterations, pr->u.p.tc);
- } else {
- KMP_COUNT_BLOCK(OMP_FOR_dynamic);
- KMP_COUNT_VALUE(FOR_dynamic_iterations, pr->u.p.tc);
- }
-
if (active) {
/* The name of this buffer should be my_buffer_index when it's free to use
* it */
@@ -962,6 +960,7 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
&(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
}
#endif
+ KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
}
/* For ordered loops, either __kmp_dispatch_finish() should be called after
@@ -1229,11 +1228,11 @@ int __kmp_dispatch_next_algorithm(int gtid,
// by 1
if (remaining > 3) {
// steal 1/4 of remaining
- KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2);
+ KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
init = (victim->u.p.ub -= (remaining >> 2));
} else {
// steal 1 chunk of 2 or 3 remaining
- KMP_COUNT_VALUE(FOR_static_steal_stolen, 1);
+ KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
init = (victim->u.p.ub -= 1);
}
__kmp_release_lock(lck, gtid);
@@ -1333,7 +1332,8 @@ int __kmp_dispatch_next_algorithm(int gtid,
*VOLATILE_CAST(kmp_int64 *) & vold.b,
*VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
// stealing succedded
- KMP_COUNT_VALUE(FOR_static_steal_stolen, vold.p.ub - vnew.p.ub);
+ KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
+ vold.p.ub - vnew.p.ub);
status = 1;
while_index = 0;
// now update own count and ub
@@ -1361,7 +1361,7 @@ int __kmp_dispatch_next_algorithm(int gtid,
init *= chunk;
limit = chunk + init - 1;
incr = pr->u.p.st;
- KMP_COUNT_VALUE(FOR_static_steal_chunks, 1);
+ KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
KMP_DEBUG_ASSERT(init <= trip);
if ((last = (limit >= trip)) != 0)
@@ -1823,6 +1823,38 @@ int __kmp_dispatch_next_algorithm(int gtid,
#define OMPT_LOOP_END // no-op
#endif
+#if KMP_STATS_ENABLED
+#define KMP_STATS_LOOP_END \
+ { \
+ kmp_int64 u, l, t, i; \
+ l = (kmp_int64)(*p_lb); \
+ u = (kmp_int64)(*p_ub); \
+ i = (kmp_int64)(pr->u.p.st); \
+ if (status == 0) { \
+ t = 0; \
+ KMP_POP_PARTITIONED_TIMER(); \
+ } else if (i == 1) { \
+ if (u >= l) \
+ t = u - l + 1; \
+ else \
+ t = 0; \
+ } else if (i < 0) { \
+ if (l >= u) \
+ t = (l - u) / (-i) + 1; \
+ else \
+ t = 0; \
+ } else { \
+ if (u >= l) \
+ t = (u - l) / i + 1; \
+ else \
+ t = 0; \
+ } \
+ KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
+ }
+#else
+#define KMP_STATS_LOOP_END /* Nothing */
+#endif
+
template <typename T>
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
T *p_lb, T *p_ub,
@@ -1840,7 +1872,7 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
// even if the actual runtme schedule is static. (Which points out a
// disadavantage of schedule(runtime): even when static scheduling is used it
// costs more than a compile time choice to use static scheduling would.)
- KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);
+ KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
int status;
dispatch_private_info_template<T> *pr;
@@ -1964,6 +1996,7 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
SSC_MARK_DISPATCH_NEXT();
#endif
OMPT_LOOP_END;
+ KMP_STATS_LOOP_END;
return status;
} else {
kmp_int32 last = 0;
@@ -2081,6 +2114,7 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
SSC_MARK_DISPATCH_NEXT();
#endif
OMPT_LOOP_END;
+ KMP_STATS_LOOP_END;
return status;
}
diff --git a/runtime/src/kmp_runtime.cpp b/runtime/src/kmp_runtime.cpp
index a16d2fd..bf0f9dc 100644
--- a/runtime/src/kmp_runtime.cpp
+++ b/runtime/src/kmp_runtime.cpp
@@ -1644,12 +1644,8 @@ int __kmp_fork_call(ident_t *loc, int gtid,
KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
parent_team->t.t_id, parent_team->t.t_pkfn));
- {
- KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
- KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
- if (!parent_team->t.t_invoke(gtid)) {
- KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
- }
+ if (!parent_team->t.t_invoke(gtid)) {
+ KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
}
KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
parent_team->t.t_id, parent_team->t.t_pkfn));
@@ -1839,11 +1835,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
// because initial code in teams should have level=0
team->t.t_level--;
// AC: call special invoker for outer "parallel" of teams construct
- {
- KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
- KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
- invoker(gtid);
- }
+ invoker(gtid);
} else {
#endif /* OMP_40_ENABLED */
argv = args;
@@ -2256,12 +2248,8 @@ int __kmp_fork_call(ident_t *loc, int gtid,
team->t.t_id, team->t.t_pkfn));
} // END of timer KMP_fork_call block
- {
- KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
- KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
- if (!team->t.t_invoke(gtid)) {
- KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
- }
+ if (!team->t.t_invoke(gtid)) {
+ KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
}
KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
team->t.t_id, team->t.t_pkfn));
@@ -3700,7 +3688,7 @@ int __kmp_register_root(int initial_thread) {
#if KMP_STATS_ENABLED
// Initialize stats as soon as possible (right after gtid assignment).
__kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
- KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
+ __kmp_stats_thread_ptr->startLife();
KMP_SET_THREAD_STATE(SERIAL_REGION);
KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
#endif
@@ -5639,11 +5627,7 @@ void *__kmp_launch_thread(kmp_info_t *this_thr) {
}
#endif
- {
- KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
- KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
- rc = (*pteam)->t.t_invoke(gtid);
- }
+ rc = (*pteam)->t.t_invoke(gtid);
KMP_ASSERT(rc);
KMP_MB();
diff --git a/runtime/src/kmp_sched.cpp b/runtime/src/kmp_sched.cpp
index 3700ee0..d402db1 100644
--- a/runtime/src/kmp_sched.cpp
+++ b/runtime/src/kmp_sched.cpp
@@ -51,8 +51,9 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
void *codeptr
#endif
) {
- KMP_COUNT_BLOCK(OMP_FOR_static);
- KMP_TIME_PARTITIONED_BLOCK(FOR_static_scheduling);
+ KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
+ KMP_PUSH_PARTITIONED_TIMER(OMP_loop_static);
+ KMP_PUSH_PARTITIONED_TIMER(OMP_loop_static_scheduling);
typedef typename traits_t<T>::unsigned_t UT;
typedef typename traits_t<T>::signed_t ST;
@@ -151,7 +152,6 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
&(task_info->task_data), 0, codeptr);
}
#endif
- KMP_COUNT_VALUE(FOR_static_iterations, 0);
return;
}
@@ -254,7 +254,6 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
loc);
}
}
- KMP_COUNT_VALUE(FOR_static_iterations, trip_count);
/* compute remaining parameters */
switch (schedtype) {
@@ -390,6 +389,26 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
}
#endif
+#if KMP_STATS_ENABLED
+ {
+ kmp_int64 t;
+ kmp_int64 u = (kmp_int64)(*pupper);
+ kmp_int64 l = (kmp_int64)(*plower);
+ kmp_int64 i = (kmp_int64)incr;
+ /* compute trip count */
+ if (i == 1) {
+ t = u - l + 1;
+ } else if (i == -1) {
+ t = l - u + 1;
+ } else if (i > 0) {
+ t = (u - l) / i + 1;
+ } else {
+ t = (l - u) / (-i) + 1;
+ }
+ KMP_COUNT_VALUE(OMP_loop_static_iterations, t);
+ KMP_POP_PARTITIONED_TIMER();
+ }
+#endif
return;
}
diff --git a/runtime/src/kmp_stats.cpp b/runtime/src/kmp_stats.cpp
index d75695f..2c0eabe 100644
--- a/runtime/src/kmp_stats.cpp
+++ b/runtime/src/kmp_stats.cpp
@@ -21,6 +21,7 @@
#include <iomanip>
#include <sstream>
#include <stdlib.h> // for atexit
+#include <cmath>
#define STRINGIZE2(x) #x
#define STRINGIZE(x) STRINGIZE2(x)
@@ -67,9 +68,18 @@ static uint32_t statsPrinted = 0;
// output interface
static kmp_stats_output_module *__kmp_stats_global_output = NULL;
+double logHistogram::binMax[] = {
+ 1.e1l, 1.e2l, 1.e3l, 1.e4l, 1.e5l, 1.e6l, 1.e7l, 1.e8l,
+ 1.e9l, 1.e10l, 1.e11l, 1.e12l, 1.e13l, 1.e14l, 1.e15l, 1.e16l,
+ 1.e17l, 1.e18l, 1.e19l, 1.e20l, 1.e21l, 1.e22l, 1.e23l, 1.e24l,
+ 1.e25l, 1.e26l, 1.e27l, 1.e28l, 1.e29l, 1.e30l};
+
/* ************* statistic member functions ************* */
void statistic::addSample(double sample) {
+ sample -= offset;
+ KMP_DEBUG_ASSERT(std::isfinite(sample));
+
double delta = sample - meanVal;
sampleCount = sampleCount + 1;
@@ -78,9 +88,14 @@ void statistic::addSample(double sample) {
minVal = std::min(minVal, sample);
maxVal = std::max(maxVal, sample);
+ if (collectingHist)
+ hist.addSample(sample);
}
statistic &statistic::operator+=(const statistic &other) {
+ if (other.sampleCount == 0)
+ return *this;
+
if (sampleCount == 0) {
*this = other;
return *this;
@@ -104,6 +119,8 @@ statistic &statistic::operator+=(const statistic &other) {
minVal = std::min(minVal, other.minVal);
maxVal = std::max(maxVal, other.maxVal);
sampleCount = newSampleCount;
+ if (collectingHist)
+ hist += other.hist;
return *this;
}
@@ -138,10 +155,89 @@ std::string statistic::format(char unit, bool total) const {
return result;
}
+/* ************* histogram member functions ************* */
+
+// Lowest bin that has anything in it
+int logHistogram::minBin() const {
+ for (int i = 0; i < numBins; i++) {
+ if (bins[i].count != 0)
+ return i - logOffset;
+ }
+ return -logOffset;
+}
+
+// Highest bin that has anything in it
+int logHistogram::maxBin() const {
+ for (int i = numBins - 1; i >= 0; i--) {
+ if (bins[i].count != 0)
+ return i - logOffset;
+ }
+ return -logOffset;
+}
+
+// Which bin does this sample belong in ?
+uint32_t logHistogram::findBin(double sample) {
+ double v = std::fabs(sample);
+ // Simply loop up looking which bin to put it in.
+ // According to a micro-architect this is likely to be faster than a binary
+ // search, since
+ // it will only have one branch mis-predict
+ for (int b = 0; b < numBins; b++)
+ if (binMax[b] > v)
+ return b;
+ fprintf(stderr,
+ "Trying to add a sample that is too large into a histogram\n");
+ KMP_ASSERT(0);
+ return -1;
+}
+
+void logHistogram::addSample(double sample) {
+ if (sample == 0.0) {
+ zeroCount += 1;
+#ifdef KMP_DEBUG
+ _total++;
+ check();
+#endif
+ return;
+ }
+ KMP_DEBUG_ASSERT(std::isfinite(sample));
+ uint32_t bin = findBin(sample);
+ KMP_DEBUG_ASSERT(0 <= bin && bin < numBins);
+
+ bins[bin].count += 1;
+ bins[bin].total += sample;
+#ifdef KMP_DEBUG
+ _total++;
+ check();
+#endif
+}
+
+// This may not be the format we want, but it'll do for now
+std::string logHistogram::format(char unit) const {
+ std::stringstream result;
+
+ result << "Bin, Count, Total\n";
+ if (zeroCount) {
+ result << "0, " << formatSI(zeroCount, 9, ' ') << ", ",
+ formatSI(0.0, 9, unit);
+ if (count(minBin()) == 0)
+ return result.str();
+ result << "\n";
+ }
+ for (int i = minBin(); i <= maxBin(); i++) {
+ result << "10**" << i << "<=v<10**" << (i + 1) << ", "
+ << formatSI(count(i), 9, ' ') << ", " << formatSI(total(i), 9, unit);
+ if (i != maxBin())
+ result << "\n";
+ }
+
+ return result.str();
+}
+
/* ************* explicitTimer member functions ************* */
-void explicitTimer::start(timer_e timerEnumValue) {
- startTime = tsc_tick_count::now();
+void explicitTimer::start(tsc_tick_count tick) {
+ startTime = tick;
totalPauseTime = 0;
if (timeStat::logEvent(timerEnumValue)) {
__kmp_stats_thread_ptr->incrementNestValue();
@@ -149,22 +245,19 @@ void explicitTimer::start(timer_e timerEnumValue) {
return;
}
-void explicitTimer::stop(timer_e timerEnumValue,
+void explicitTimer::stop(tsc_tick_count tick,
kmp_stats_list *stats_ptr /* = nullptr */) {
if (startTime.getValue() == 0)
return;
- tsc_tick_count finishTime = tsc_tick_count::now();
-
- // stat->addSample ((tsc_tick_count::now() - startTime).ticks());
- stat->addSample(((finishTime - startTime) - totalPauseTime).ticks());
+ stat->addSample(((tick - startTime) - totalPauseTime).ticks());
if (timeStat::logEvent(timerEnumValue)) {
if (!stats_ptr)
stats_ptr = __kmp_stats_thread_ptr;
stats_ptr->push_event(
startTime.getValue() - __kmp_stats_start_time.getValue(),
- finishTime.getValue() - __kmp_stats_start_time.getValue(),
+ tick.getValue() - __kmp_stats_start_time.getValue(),
__kmp_stats_thread_ptr->getNestValue(), timerEnumValue);
stats_ptr->decrementNestValue();
}
@@ -178,52 +271,67 @@ void explicitTimer::stop(timer_e timerEnumValue,
/* ************* partitionedTimers member functions ************* */
partitionedTimers::partitionedTimers() { timer_stack.reserve(8); }
-// add a timer to this collection of partitioned timers.
-void partitionedTimers::add_timer(explicit_timer_e timer_index,
- explicitTimer *timer_pointer) {
- KMP_DEBUG_ASSERT((int)timer_index < (int)EXPLICIT_TIMER_LAST + 1);
- timers[timer_index] = timer_pointer;
-}
-
// initialize the paritioned timers to an initial timer
-void partitionedTimers::init(timerPair init_timer_pair) {
+void partitionedTimers::init(explicitTimer timer) {
KMP_DEBUG_ASSERT(this->timer_stack.size() == 0);
- timer_stack.push_back(init_timer_pair);
- timers[init_timer_pair.get_index()]->start(init_timer_pair.get_timer());
+ timer_stack.push_back(timer);
+ timer_stack.back().start(tsc_tick_count::now());
}
// stop/save the current timer, and start the new timer (timer_pair)
// There is a special condition where if the current timer is equal to
// the one you are trying to push, then it only manipulates the stack,
// and it won't stop/start the currently running timer.
-void partitionedTimers::push(timerPair timer_pair) {
+void partitionedTimers::push(explicitTimer timer) {
// get the current timer
- // stop current timer
+ // pause current timer
// push new timer
// start the new timer
+ explicitTimer *current_timer, *new_timer;
+ size_t stack_size;
KMP_DEBUG_ASSERT(this->timer_stack.size() > 0);
- timerPair current_timer = timer_stack.back();
- timer_stack.push_back(timer_pair);
- if (current_timer != timer_pair) {
- timers[current_timer.get_index()]->pause();
- timers[timer_pair.get_index()]->start(timer_pair.get_timer());
- }
+ timer_stack.push_back(timer);
+ stack_size = timer_stack.size();
+ current_timer = &(timer_stack[stack_size - 2]);
+ new_timer = &(timer_stack[stack_size - 1]);
+ tsc_tick_count tick = tsc_tick_count::now();
+ current_timer->pause(tick);
+ new_timer->start(tick);
}
// stop/discard the current timer, and start the previously saved timer
void partitionedTimers::pop() {
// get the current timer
- // stop current timer
+ // stop current timer (record event/sample)
// pop current timer
- // get the new current timer and start it back up
- KMP_DEBUG_ASSERT(this->timer_stack.size() > 1);
- timerPair current_timer = timer_stack.back();
+ // get the new current timer and resume
+ explicitTimer *old_timer, *new_timer;
+ size_t stack_size = timer_stack.size();
+ KMP_DEBUG_ASSERT(stack_size > 1);
+ old_timer = &(timer_stack[stack_size - 1]);
+ new_timer = &(timer_stack[stack_size - 2]);
+ tsc_tick_count tick = tsc_tick_count::now();
+ old_timer->stop(tick);
+ new_timer->resume(tick);
timer_stack.pop_back();
- timerPair new_timer = timer_stack.back();
- if (current_timer != new_timer) {
- timers[current_timer.get_index()]->stop(current_timer.get_timer());
- timers[new_timer.get_index()]->resume();
- }
+}
+
+void partitionedTimers::exchange(explicitTimer timer) {
+ // get the current timer
+ // stop current timer (record event/sample)
+ // push new timer
+ // start the new timer
+ explicitTimer *current_timer, *new_timer;
+ size_t stack_size;
+ KMP_DEBUG_ASSERT(this->timer_stack.size() > 0);
+ tsc_tick_count tick = tsc_tick_count::now();
+ stack_size = timer_stack.size();
+ current_timer = &(timer_stack[stack_size - 1]);
+ current_timer->stop(tick);
+ timer_stack.pop_back();
+ timer_stack.push_back(timer);
+ new_timer = &(timer_stack[stack_size - 1]);
+ new_timer->start(tick);
}
// Wind up all the currently running timers.
@@ -234,10 +342,10 @@ void partitionedTimers::windup() {
while (timer_stack.size() > 1) {
this->pop();
}
+ // Pop the timer from the init() call
if (timer_stack.size() > 0) {
- timerPair last_timer = timer_stack.back();
+ timer_stack.back().stop(tsc_tick_count::now());
timer_stack.pop_back();
- timers[last_timer.get_index()]->stop(last_timer.get_timer());
}
}
@@ -255,8 +363,8 @@ void kmp_stats_event_vector::deallocate() {
// event2 or zero if event1 == event2. This sorts by start time (lowest to
// highest).
int compare_two_events(const void *event1, const void *event2) {
- kmp_stats_event *ev1 = (kmp_stats_event *)event1;
- kmp_stats_event *ev2 = (kmp_stats_event *)event2;
+ const kmp_stats_event *ev1 = RCAST(const kmp_stats_event *, event1);
+ const kmp_stats_event *ev2 = RCAST(const kmp_stats_event *, event2);
if (ev1->getStart() < ev2->getStart())
return -1;
@@ -353,9 +461,93 @@ const char *kmp_stats_output_module::plotFileName = NULL;
int kmp_stats_output_module::printPerThreadFlag = 0;
int kmp_stats_output_module::printPerThreadEventsFlag = 0;
+static char const *lastName(char *name) {
+ int l = strlen(name);
+ for (int i = l - 1; i >= 0; --i) {
+ if (name[i] == '.')
+ name[i] = '_';
+ if (name[i] == '/')
+ return name + i + 1;
+ }
+ return name;
+}
+
+/* Read the name of the executable from /proc/self/cmdline */
+static char const *getImageName(char *buffer, size_t buflen) {
+ FILE *f = fopen("/proc/self/cmdline", "r");
+ buffer[0] = char(0);
+ if (!f)
+ return buffer;
+
+ // The file contains char(0) delimited words from the commandline.
+ // This just returns the last filename component of the first word on the
+ // line.
+ size_t n = fread(buffer, 1, buflen, f);
+ if (n == 0) {
+ fclose(f);
+ KMP_CHECK_SYSFAIL("fread", 1)
+ }
+ fclose(f);
+ buffer[buflen - 1] = char(0);
+ return lastName(buffer);
+}
+
+static void getTime(char *buffer, size_t buflen, bool underscores = false) {
+ time_t timer;
+
+ time(&timer);
+
+ struct tm *tm_info = localtime(&timer);
+ if (underscores)
+ strftime(buffer, buflen, "%Y-%m-%d_%H%M%S", tm_info);
+ else
+ strftime(buffer, buflen, "%Y-%m-%d %H%M%S", tm_info);
+}
+
+/* Generate a stats file name, expanding prototypes */
+static std::string generateFilename(char const *prototype,
+ char const *imageName) {
+ std::string res;
+
+ for (int i = 0; prototype[i] != char(0); i++) {
+ char ch = prototype[i];
+
+ if (ch == '%') {
+ i++;
+ if (prototype[i] == char(0))
+ break;
+
+ switch (prototype[i]) {
+ case 't': // Insert time and date
+ {
+ char date[26];
+ getTime(date, sizeof(date), true);
+ res += date;
+ } break;
+ case 'e': // Insert executable name
+ res += imageName;
+ break;
+ case 'p': // Insert pid
+ {
+ std::stringstream ss;
+ ss << getpid();
+ res += ss.str();
+ } break;
+ default:
+ res += prototype[i];
+ break;
+ }
+ } else
+ res += ch;
+ }
+ return res;
+}
+
// init() is called very near the beginning of execution time in the constructor
// of __kmp_stats_global_output
void kmp_stats_output_module::init() {
+
+ fprintf(stderr, "*** Stats enabled OpenMP* runtime ***\n");
char *statsFileName = getenv("KMP_STATS_FILE");
eventsFileName = getenv("KMP_STATS_EVENTS_FILE");
plotFileName = getenv("KMP_STATS_PLOT_FILE");
@@ -364,22 +556,10 @@ void kmp_stats_output_module::init() {
// set the stats output filenames based on environment variables and defaults
if (statsFileName) {
- // append the process id to the output filename
- // events.csv --> events-pid.csv
- size_t index;
- std::string baseFileName, pid, suffix;
- std::stringstream ss;
- outputFileName = std::string(statsFileName);
- index = outputFileName.find_last_of('.');
- if (index == std::string::npos) {
- baseFileName = outputFileName;
- } else {
- baseFileName = outputFileName.substr(0, index);
- suffix = outputFileName.substr(index);
- }
- ss << getpid();
- pid = ss.str();
- outputFileName = baseFileName + "-" + pid + suffix;
+ char imageName[1024];
+ // Process any escapes (e.g., %p, %e, %t) in the name
+ outputFileName = generateFilename(
+ statsFileName, getImageName(&imageName[0], sizeof(imageName)));
}
eventsFileName = eventsFileName ? eventsFileName : "events.dat";
plotFileName = plotFileName ? plotFileName : "events.plt";
@@ -396,8 +576,6 @@ void kmp_stats_output_module::init() {
// will clear flag so that no event will be logged
timeStat::clearEventFlags();
}
-
- return;
}
void kmp_stats_output_module::setupEventColors() {
@@ -410,28 +588,43 @@ void kmp_stats_output_module::setupEventColors() {
globalColorIndex = (globalColorIndex + 1) % numGlobalColors;
}
}
- return;
}
void kmp_stats_output_module::printTimerStats(FILE *statsOut,
statistic const *theStats,
statistic const *totalStats) {
- fprintf(statsOut, "Timer, SampleCount, Min, "
- "Mean, Max, Total, SD\n");
+ fprintf(statsOut,
+ "Timer, SampleCount, Min, "
+ "Mean, Max, Total, SD\n");
for (timer_e s = timer_e(0); s < TIMER_LAST; s = timer_e(s + 1)) {
statistic const *stat = &theStats[s];
char tag = timeStat::noUnits(s) ? ' ' : 'T';
- fprintf(statsOut, "%-28s, %s\n", timeStat::name(s),
+ fprintf(statsOut, "%-35s, %s\n", timeStat::name(s),
stat->format(tag, true).c_str());
}
// Also print the Total_ versions of times.
for (timer_e s = timer_e(0); s < TIMER_LAST; s = timer_e(s + 1)) {
char tag = timeStat::noUnits(s) ? ' ' : 'T';
if (totalStats && !timeStat::noTotal(s))
- fprintf(statsOut, "Total_%-22s, %s\n", timeStat::name(s),
+ fprintf(statsOut, "Total_%-29s, %s\n", timeStat::name(s),
totalStats[s].format(tag, true).c_str());
}
+
+ // Print historgram of statistics
+ if (theStats[0].haveHist()) {
+ fprintf(statsOut, "\nTimer distributions\n");
+ for (int s = 0; s < TIMER_LAST; s++) {
+ statistic const *stat = &theStats[s];
+
+ if (stat->getCount() != 0) {
+ char tag = timeStat::noUnits(timer_e(s)) ? ' ' : 'T';
+
+ fprintf(statsOut, "%s\n", timeStat::name(timer_e(s)));
+ fprintf(statsOut, "%s\n", stat->getHist()->format(tag).c_str());
+ }
+ }
+ }
}
void kmp_stats_output_module::printCounterStats(FILE *statsOut,
@@ -443,6 +636,18 @@ void kmp_stats_output_module::printCounterStats(FILE *statsOut,
fprintf(statsOut, "%-25s, %s\n", counter::name(counter_e(s)),
stat->format(' ', true).c_str());
}
+ // Print histogram of counters
+ if (theStats[0].haveHist()) {
+ fprintf(statsOut, "\nCounter distributions\n");
+ for (int s = 0; s < COUNTER_LAST; s++) {
+ statistic const *stat = &theStats[s];
+
+ if (stat->getCount() != 0) {
+ fprintf(statsOut, "%s\n", counter::name(counter_e(s)));
+ fprintf(statsOut, "%s\n", stat->getHist()->format(' ').c_str());
+ }
+ }
+ }
}
void kmp_stats_output_module::printCounters(FILE *statsOut,
@@ -480,9 +685,7 @@ void kmp_stats_output_module::windupExplicitTimers() {
for (it = __kmp_stats_list->begin(); it != __kmp_stats_list->end(); it++) {
kmp_stats_list *ptr = *it;
ptr->getPartitionedTimers()->windup();
- for (int timer = 0; timer < EXPLICIT_TIMER_LAST; timer++) {
- ptr->getExplicitTimer(explicit_timer_e(timer))->stop((timer_e)timer, ptr);
- }
+ ptr->endLife();
}
}
@@ -547,6 +750,11 @@ void kmp_stats_output_module::printPloticusFile() {
return;
}
+static void outputEnvVariable(FILE *statsOut, char const *name) {
+ char const *value = getenv(name);
+ fprintf(statsOut, "# %s = %s\n", name, value ? value : "*unspecified*");
+}
+
/* Print some useful information about
* the date and time this experiment ran.
* the machine on which it ran.
@@ -570,6 +778,11 @@ void kmp_stats_output_module::printHeaderInfo(FILE *statsOut) {
else
fprintf(statsOut, "# Nominal frequency: %sz\n",
formatSI(double(__kmp_cpuinfo.frequency), 9, 'H').c_str());
+ outputEnvVariable(statsOut, "KMP_HW_SUBSET");
+ outputEnvVariable(statsOut, "KMP_AFFINITY");
+ outputEnvVariable(statsOut, "KMP_BLOCKTIME");
+ outputEnvVariable(statsOut, "KMP_LIBRARY");
+ fprintf(statsOut, "# Production runtime built " __DATE__ " " __TIME__ "\n");
#endif
}
@@ -665,7 +878,6 @@ void __kmp_reset_stats() {
for (it = __kmp_stats_list->begin(); it != __kmp_stats_list->end(); it++) {
timeStat *timers = (*it)->getTimers();
counter *counters = (*it)->getCounters();
- explicitTimer *eTimers = (*it)->getExplicitTimers();
for (int t = 0; t < TIMER_LAST; t++)
timers[t].reset();
@@ -673,9 +885,6 @@ void __kmp_reset_stats() {
for (int c = 0; c < COUNTER_LAST; c++)
counters[c].reset();
- for (int t = 0; t < EXPLICIT_TIMER_LAST; t++)
- eTimers[t].reset();
-
// reset the event vector so all previous events are "erased"
(*it)->resetEventVector();
}
diff --git a/runtime/src/kmp_stats.h b/runtime/src/kmp_stats.h
index f8288de..be94843 100644
--- a/runtime/src/kmp_stats.h
+++ b/runtime/src/kmp_stats.h
@@ -15,6 +15,7 @@
//===----------------------------------------------------------------------===//
#include "kmp_config.h"
+#include "kmp_debug.h"
#if KMP_STATS_ENABLED
/* Statistics accumulator.
@@ -36,7 +37,10 @@
/* Enable developer statistics here if you want them. They are more detailed
than is useful for application characterisation and are intended for the
runtime library developer. */
-// #define KMP_DEVELOPER_STATS 1
+#define KMP_DEVELOPER_STATS 0
+
+/* Enable/Disable histogram output */
+#define KMP_STATS_HIST 0
/*!
* @ingroup STATS_GATHERING
@@ -91,9 +95,9 @@ enum stats_state_e {
#define KMP_FOREACH_COUNTER(macro, arg) \
macro(OMP_PARALLEL,stats_flags_e::onlyInMaster|stats_flags_e::noTotal,arg) \
macro(OMP_NESTED_PARALLEL, 0, arg) \
- macro(OMP_FOR_static, 0, arg) \
- macro(OMP_FOR_static_steal, 0, arg) \
- macro(OMP_FOR_dynamic, 0, arg) \
+ macro(OMP_LOOP_STATIC, 0, arg) \
+ macro(OMP_LOOP_STATIC_STEAL, 0, arg) \
+ macro(OMP_LOOP_DYNAMIC, 0, arg) \
macro(OMP_DISTRIBUTE, 0, arg) \
macro(OMP_BARRIER, 0, arg) \
macro(OMP_CRITICAL, 0, arg) \
@@ -119,7 +123,7 @@ enum stats_state_e {
* @param arg a user defined argument to send to the user defined macro
*
* \details A timer collects multiple samples of some count in each thread and
- * then finally aggregates alll of the samples from all of the threads. For most
+ * then finally aggregates all of the samples from all of the threads. For most
* timers the printing code also provides an aggregation over the thread totals.
* These are printed as TOTAL_foo. The count is normally a time (in ticks),
* hence the name "timer". (But can be any value, so we use this for "number of
@@ -131,55 +135,59 @@ enum stats_state_e {
*/
// clang-format off
#define KMP_FOREACH_TIMER(macro, arg) \
- macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg) \
- macro (FOR_static_scheduling, 0, arg) \
- macro (FOR_dynamic_scheduling, 0, arg) \
- macro (OMP_critical, 0, arg) \
- macro (OMP_critical_wait, 0, arg) \
- macro (OMP_single, 0, arg) \
- macro (OMP_master, 0, arg) \
- macro (OMP_idle, stats_flags_e::logEvent, arg) \
- macro (OMP_plain_barrier, stats_flags_e::logEvent, arg) \
- macro (OMP_fork_barrier, stats_flags_e::logEvent, arg) \
- macro (OMP_join_barrier, stats_flags_e::logEvent, arg) \
- macro (OMP_parallel, stats_flags_e::logEvent, arg) \
- macro (OMP_task_immediate, 0, arg) \
- macro (OMP_task_taskwait, 0, arg) \
- macro (OMP_task_taskyield, 0, arg) \
- macro (OMP_task_taskgroup, 0, arg) \
- macro (OMP_task_join_bar, 0, arg) \
- macro (OMP_task_plain_bar, 0, arg) \
- macro (OMP_serial, stats_flags_e::logEvent, arg) \
- macro (OMP_taskloop_scheduling, 0, arg) \
- macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal,\
- arg) \
- macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal, \
- arg) \
- macro (FOR_static_iterations, \
- stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
- macro (FOR_dynamic_iterations, \
- stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
- macro (FOR_static_steal_stolen, \
- stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
- macro (FOR_static_steal_chunks, \
- stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
- KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
+ macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg) \
+ macro (OMP_parallel, stats_flags_e::logEvent, arg) \
+ macro (OMP_parallel_overhead, stats_flags_e::logEvent, arg) \
+ macro (OMP_loop_static, 0, arg) \
+ macro (OMP_loop_static_scheduling, 0, arg) \
+ macro (OMP_loop_dynamic, 0, arg) \
+ macro (OMP_loop_dynamic_scheduling, 0, arg) \
+ macro (OMP_critical, 0, arg) \
+ macro (OMP_critical_wait, 0, arg) \
+ macro (OMP_single, 0, arg) \
+ macro (OMP_master, 0, arg) \
+ macro (OMP_task_immediate, 0, arg) \
+ macro (OMP_task_taskwait, 0, arg) \
+ macro (OMP_task_taskyield, 0, arg) \
+ macro (OMP_task_taskgroup, 0, arg) \
+ macro (OMP_task_join_bar, 0, arg) \
+ macro (OMP_task_plain_bar, 0, arg) \
+ macro (OMP_taskloop_scheduling, 0, arg) \
+ macro (OMP_plain_barrier, stats_flags_e::logEvent, arg) \
+ macro (OMP_idle, stats_flags_e::logEvent, arg) \
+ macro (OMP_fork_barrier, stats_flags_e::logEvent, arg) \
+ macro (OMP_join_barrier, stats_flags_e::logEvent, arg) \
+ macro (OMP_serial, stats_flags_e::logEvent, arg) \
+ macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal, \
+ arg) \
+ macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal, \
+ arg) \
+ macro (OMP_loop_static_iterations, \
+ stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
+ macro (OMP_loop_dynamic_iterations, \
+ stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
+ KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
// clang-format on
-// OMP_start_end -- Time from when OpenMP is initialized until the
-// stats are printed at exit
-// OMP_serial -- Thread zero time executing serial code
-// OMP_work -- Elapsed time in code dispatched by a fork (measured
-// in the thread)
-// OMP_barrier -- Time at "real" barriers (includes task time)
-// FOR_static_scheduling -- Time spent doing scheduling for a static "for"
-// FOR_dynamic_scheduling -- Time spent doing scheduling for a dynamic "for"
-// OMP_idle -- Worker threads time spent waiting for inclusion in
-// a parallel region
-// OMP_plain_barrier -- Time spent in a barrier construct
-// OMP_fork_join_barrier -- Time spent in a the fork-join barrier surrounding a
-// parallel region
-// OMP_parallel -- Time spent inside a parallel construct
+// OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either
+// initializing OpenMP or being created by a master)
+// until the thread is destroyed
+// OMP_parallel -- Time thread spends executing work directly
+// within a #pragma omp parallel
+// OMP_parallel_overhead -- Time thread spends setting up a parallel region
+// OMP_loop_static -- Time thread spends executing loop iterations from
+// a statically scheduled loop
+// OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations
+// from a statically scheduled loop
+// OMP_loop_dynamic -- Time thread spends executing loop iterations from
+// a dynamically scheduled loop
+// OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations
+// from a dynamically scheduled loop
+// OMP_critical -- Time thread spends executing critical section
+// OMP_critical_wait -- Time thread spends waiting to enter
+// a critcal seciton
+// OMP_single -- Time spent executing a "single" region
+// OMP_master -- Time spent executing a "master" region
// OMP_task_immediate -- Time spent executing non-deferred tasks
// OMP_task_taskwait -- Time spent executing tasks inside a taskwait
// construct
@@ -190,17 +198,24 @@ enum stats_state_e {
// OMP_task_join_bar -- Time spent executing tasks inside a join barrier
// OMP_task_plain_bar -- Time spent executing tasks inside a barrier
// construct
-// OMP_single -- Time spent executing a "single" region
-// OMP_master -- Time spent executing a "master" region
+// OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop
+// construct
+// OMP_plain_barrier -- Time spent in a #pragma omp barrier construct or
+// inside implicit barrier at end of worksharing
+// construct
+// OMP_idle -- Time worker threads spend waiting for next
+// parallel region
+// OMP_fork_barrier -- Time spent in a the fork barrier surrounding a
+// parallel region
+// OMP_join_barrier -- Time spent in a the join barrier surrounding a
+// parallel region
+// OMP_serial -- Time thread zero spends executing serial code
// OMP_set_numthreads -- Values passed to omp_set_num_threads
// OMP_PARALLEL_args -- Number of arguments passed to a parallel region
-// FOR_static_iterations -- Number of available parallel chunks of work in a
-// static for
-// FOR_dynamic_iterations -- Number of available parallel chunks of work in a
-// dynamic for
-// Both adjust for any chunking, so if there were an
-// iteration count of 20 but a chunk size of 10, we'd
-// record 2.
+// OMP_loop_static_iterations -- Number of iterations thread is assigned for
+// statically scheduled loops
+// OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for
+// dynamically scheduled loops
#if (KMP_DEVELOPER_STATS)
// Timers which are of interest to runtime library developers, not end users.
@@ -235,7 +250,11 @@ enum stats_state_e {
macro(USER_suspend, 0, arg) \
macro(KMP_allocate_team, 0, arg) \
macro(KMP_setup_icv_copy, 0, arg) \
- macro(USER_icv_copy, 0, arg)
+ macro(USER_icv_copy, 0, arg) \
+ macro (FOR_static_steal_stolen, \
+ stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
+ macro (FOR_static_steal_chunks, \
+ stats_flags_e::noUnits | stats_flags_e::noTotal, arg)
#else
#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
#endif
@@ -272,53 +291,143 @@ enum explicit_timer_e {
enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST };
#undef ENUMERATE
-class timerPair {
- explicit_timer_e timer_index;
- timer_e timer;
+/*
+ * A logarithmic histogram. It accumulates the number of values in each power of
+ * ten bin. So 1<=x<10, 10<=x<100, ...
+ * Mostly useful where we have some big outliers and want to see information
+ * about them.
+ */
+class logHistogram {
+ enum {
+ numBins = 31, /* Number of powers of 10. If this changes you need to change
+ * the initializer for binMax */
+
+ /*
+ * If you want to use this to analyse values that may be less than 1, (for
+ * instance times in s), then the logOffset gives you negative powers.
+ * In our case here, we're just looking at times in ticks, or counts, so we
+ * can never see values with magnitude < 1 (other than zero), so we can set
+ * it to 0. As above change the initializer if you change this.
+ */
+ logOffset = 0
+ };
+ uint32_t KMP_ALIGN_CACHE zeroCount;
+ struct {
+ uint32_t count;
+ double total;
+ } bins[numBins];
+
+ static double binMax[numBins];
+
+#ifdef KMP_DEBUG
+ uint64_t _total;
+
+ void check() const {
+ uint64_t t = zeroCount;
+ for (int i = 0; i < numBins; i++)
+ t += bins[i].count;
+ KMP_DEBUG_ASSERT(t == _total);
+ }
+#else
+ void check() const {}
+#endif
public:
- timerPair(explicit_timer_e ti, timer_e t) : timer_index(ti), timer(t) {}
- inline explicit_timer_e get_index() const { return timer_index; }
- inline timer_e get_timer() const { return timer; }
- bool operator==(const timerPair &rhs) {
- return this->get_index() == rhs.get_index();
+ logHistogram() { reset(); }
+
+ logHistogram(logHistogram const &o) {
+ for (int i = 0; i < numBins; i++)
+ bins[i] = o.bins[i];
+#ifdef KMP_DEBUG
+ _total = o._total;
+#endif
}
- bool operator!=(const timerPair &rhs) { return !(*this == rhs); }
+
+ void reset() {
+ zeroCount = 0;
+ for (int i = 0; i < numBins; i++) {
+ bins[i].count = 0;
+ bins[i].total = 0;
+ }
+
+#ifdef KMP_DEBUG
+ _total = 0;
+#endif
+ }
+ uint32_t count(int b) const { return bins[b + logOffset].count; }
+ double total(int b) const { return bins[b + logOffset].total; }
+ static uint32_t findBin(double sample);
+
+ logHistogram &operator+=(logHistogram const &o) {
+ zeroCount += o.zeroCount;
+ for (int i = 0; i < numBins; i++) {
+ bins[i].count += o.bins[i].count;
+ bins[i].total += o.bins[i].total;
+ }
+#ifdef KMP_DEBUG
+ _total += o._total;
+ check();
+#endif
+
+ return *this;
+ }
+
+ void addSample(double sample);
+ int minBin() const;
+ int maxBin() const;
+
+ std::string format(char) const;
};
class statistic {
- double minVal;
+ double KMP_ALIGN_CACHE minVal;
double maxVal;
double meanVal;
double m2;
uint64_t sampleCount;
+ double offset;
+ bool collectingHist;
+ logHistogram hist;
public:
- statistic() { reset(); }
+ statistic(bool doHist = bool(KMP_STATS_HIST)) {
+ reset();
+ collectingHist = doHist;
+ }
statistic(statistic const &o)
: minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2),
- sampleCount(o.sampleCount) {}
-
+ sampleCount(o.sampleCount), offset(o.offset),
+ collectingHist(o.collectingHist), hist(o.hist) {}
+ statistic(double minv, double maxv, double meanv, uint64_t sc, double sd)
+ : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc),
+ sampleCount(sc), offset(0.0), collectingHist(false) {}
+ bool haveHist() const { return collectingHist; }
double getMin() const { return minVal; }
double getMean() const { return meanVal; }
double getMax() const { return maxVal; }
uint64_t getCount() const { return sampleCount; }
double getSD() const { return sqrt(m2 / sampleCount); }
double getTotal() const { return sampleCount * meanVal; }
+ logHistogram const *getHist() const { return &hist; }
+ void setOffset(double d) { offset = d; }
void reset() {
minVal = std::numeric_limits<double>::max();
- maxVal = -std::numeric_limits<double>::max();
+ maxVal = -minVal;
meanVal = 0.0;
m2 = 0.0;
sampleCount = 0;
+ offset = 0.0;
+ hist.reset();
}
void addSample(double sample);
void scale(double factor);
void scaleDown(double f) { scale(1. / f); }
+ void forceCount(uint64_t count) { sampleCount = count; }
statistic &operator+=(statistic const &other);
std::string format(char unit, bool total = false) const;
+ std::string formatHist(char unit) const { return hist.format(unit); }
};
struct statInfo {
@@ -359,39 +468,29 @@ public:
// to live on the stack of the thread, they're more work to use.
class explicitTimer {
timeStat *stat;
+ timer_e timerEnumValue;
tsc_tick_count startTime;
tsc_tick_count pauseStartTime;
tsc_tick_count::tsc_interval_t totalPauseTime;
public:
- explicitTimer()
- : stat(0), startTime(0), pauseStartTime(0), totalPauseTime() {}
- explicitTimer(timeStat *s)
- : stat(s), startTime(), pauseStartTime(0), totalPauseTime() {}
-
- void setStat(timeStat *s) { stat = s; }
- void start(timer_e timerEnumValue);
- void pause() { pauseStartTime = tsc_tick_count::now(); }
- void resume() { totalPauseTime += (tsc_tick_count::now() - pauseStartTime); }
- void stop(timer_e timerEnumValue, kmp_stats_list *stats_ptr = nullptr);
+ explicitTimer(timeStat *s, timer_e te)
+ : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0),
+ totalPauseTime() {}
+
+ // void setStat(timeStat *s) { stat = s; }
+ void start(tsc_tick_count tick);
+ void pause(tsc_tick_count tick) { pauseStartTime = tick; }
+ void resume(tsc_tick_count tick) {
+ totalPauseTime += (tick - pauseStartTime);
+ }
+ void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr);
void reset() {
startTime = 0;
pauseStartTime = 0;
totalPauseTime = 0;
}
-};
-
-// Where all you need is to time a block, this is enough.
-// (It avoids the need to have an explicit end, leaving the scope suffices.)
-class blockTimer : public explicitTimer {
- timer_e timerEnumValue;
-
-public:
- blockTimer(timeStat *s, timer_e newTimerEnumValue)
- : timerEnumValue(newTimerEnumValue), explicitTimer(s) {
- start(timerEnumValue);
- }
- ~blockTimer() { stop(timerEnumValue); }
+ timer_e get_type() const { return timerEnumValue; }
};
// Where you need to partition a threads clock ticks into separate states
@@ -402,14 +501,13 @@ public:
// versa
class partitionedTimers {
private:
- explicitTimer *timers[EXPLICIT_TIMER_LAST + 1];
- std::vector<timerPair> timer_stack;
+ std::vector<explicitTimer> timer_stack;
public:
partitionedTimers();
- void add_timer(explicit_timer_e timer_index, explicitTimer *timer_pointer);
- void init(timerPair timer_index);
- void push(timerPair timer_index);
+ void init(explicitTimer timer);
+ void exchange(explicitTimer timer);
+ void push(explicitTimer timer);
void pop();
void windup();
};
@@ -418,12 +516,11 @@ public:
// It avoids the need to have an explicit end, leaving the scope suffices.
class blockPartitionedTimer {
partitionedTimers *part_timers;
- timerPair timer_pair;
public:
- blockPartitionedTimer(partitionedTimers *pt, timerPair tp)
- : part_timers(pt), timer_pair(tp) {
- part_timers->push(timer_pair);
+ blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer)
+ : part_timers(pt) {
+ part_timers->push(timer);
}
~blockPartitionedTimer() { part_timers->pop(); }
};
@@ -609,7 +706,7 @@ class kmp_stats_list {
int gtid;
timeStat _timers[TIMER_LAST + 1];
counter _counters[COUNTER_LAST + 1];
- explicitTimer _explicitTimers[EXPLICIT_TIMER_LAST + 1];
+ explicitTimer thread_life_timer;
partitionedTimers _partitionedTimers;
int _nestLevel; // one per thread
kmp_stats_event_vector _event_vector;
@@ -620,28 +717,21 @@ class kmp_stats_list {
public:
kmp_stats_list()
- : _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE),
- thread_is_idle_flag(0) {
-#define doInit(name, ignore1, ignore2) \
- getExplicitTimer(EXPLICIT_TIMER_##name)->setStat(getTimer(TIMER_##name)); \
- _partitionedTimers.add_timer(EXPLICIT_TIMER_##name, \
- getExplicitTimer(EXPLICIT_TIMER_##name));
- KMP_FOREACH_EXPLICIT_TIMER(doInit, 0);
-#undef doInit
- }
+ : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life],
+ TIMER_OMP_worker_thread_life),
+ _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE),
+ thread_is_idle_flag(0) {}
~kmp_stats_list() {}
inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; }
inline counter *getCounter(counter_e idx) { return &_counters[idx]; }
- inline explicitTimer *getExplicitTimer(explicit_timer_e idx) {
- return &_explicitTimers[idx];
- }
inline partitionedTimers *getPartitionedTimers() {
return &_partitionedTimers;
}
inline timeStat *getTimers() { return _timers; }
inline counter *getCounters() { return _counters; }
- inline explicitTimer *getExplicitTimers() { return _explicitTimers; }
inline kmp_stats_event_vector &getEventVector() { return _event_vector; }
+ inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); }
+ inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); }
inline void resetEventVector() { _event_vector.reset(); }
inline void incrementNestValue() { _nestLevel++; }
inline int getNestValue() { return _nestLevel; }
@@ -774,22 +864,6 @@ extern kmp_stats_output_module __kmp_stats_output;
// Simple, standard interfaces that drop out completely if stats aren't enabled
/*!
- * \brief Uses specified timer (name) to time code block.
- *
- * @param name timer name as specified under the KMP_FOREACH_TIMER() macro
- *
- * \details Use KMP_TIME_BLOCK(name) macro to time a code block. This will
- * record the time taken in the block and use the destructor to stop the timer.
- * Convenient! With this definition you can't have more than one KMP_TIME_BLOCK
- * in the same code block. I don't think that's a problem.
- *
- * @ingroup STATS_GATHERING
-*/
-#define KMP_TIME_BLOCK(name) \
- blockTimer __BLOCKTIME__(__kmp_stats_thread_ptr->getTimer(TIMER_##name), \
- TIMER_##name)
-
-/*!
* \brief Adds value to specified timer (name).
*
* @param name timer name as specified under the KMP_FOREACH_TIMER() macro
@@ -817,44 +891,6 @@ extern kmp_stats_output_module __kmp_stats_output;
__kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment()
/*!
- * \brief "Starts" an explicit timer which will need a corresponding
- * KMP_STOP_EXPLICIT_TIMER() macro.
- *
- * @param name explicit timer name as specified under the
- * KMP_FOREACH_EXPLICIT_TIMER() macro
- *
- * \details Use to start a timer. This will need a corresponding
- * KMP_STOP_EXPLICIT_TIMER() macro to stop the timer unlike the
- * KMP_TIME_BLOCK(name) macro which has an implicit stopping macro at the end
- * of the code block. All explicit timers are stopped at library exit time
- * before the final statistics are outputted.
- *
- * @ingroup STATS_GATHERING
-*/
-#define KMP_START_EXPLICIT_TIMER(name) \
- __kmp_stats_thread_ptr->getExplicitTimer(EXPLICIT_TIMER_##name) \
- ->start(TIMER_##name)
-
-/*!
- * \brief "Stops" an explicit timer.
- *
- * @param name explicit timer name as specified under the
- * KMP_FOREACH_EXPLICIT_TIMER() macro
- *
- * \details Use KMP_STOP_EXPLICIT_TIMER(name) to stop a timer. When this is
- * done, the time between the last KMP_START_EXPLICIT_TIMER(name) and this
- * KMP_STOP_EXPLICIT_TIMER(name) will be added to the timer's stat value. The
- * timer will then be reset. After the KMP_STOP_EXPLICIT_TIMER(name) macro is
- * called, another call to KMP_START_EXPLICIT_TIMER(name) will start the timer
- * once again.
- *
- * @ingroup STATS_GATHERING
-*/
-#define KMP_STOP_EXPLICIT_TIMER(name) \
- __kmp_stats_thread_ptr->getExplicitTimer(EXPLICIT_TIMER_##name) \
- ->stop(TIMER_##name)
-
-/*!
* \brief Outputs the current thread statistics and reset them.
*
* @param heading_string heading put above the final stats output
@@ -881,21 +917,26 @@ extern kmp_stats_output_module __kmp_stats_output;
* @ingroup STATS_GATHERING
*/
#define KMP_INIT_PARTITIONED_TIMERS(name) \
- __kmp_stats_thread_ptr->getPartitionedTimers()->init( \
- timerPair(EXPLICIT_TIMER_##name, TIMER_##name))
+ __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer( \
+ __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
#define KMP_TIME_PARTITIONED_BLOCK(name) \
blockPartitionedTimer __PBLOCKTIME__( \
__kmp_stats_thread_ptr->getPartitionedTimers(), \
- timerPair(EXPLICIT_TIMER_##name, TIMER_##name))
+ explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name), \
+ TIMER_##name))
#define KMP_PUSH_PARTITIONED_TIMER(name) \
- __kmp_stats_thread_ptr->getPartitionedTimers()->push( \
- timerPair(EXPLICIT_TIMER_##name, TIMER_##name))
+ __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer( \
+ __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
#define KMP_POP_PARTITIONED_TIMER() \
__kmp_stats_thread_ptr->getPartitionedTimers()->pop()
+#define KMP_EXCHANGE_PARTITIONED_TIMER(name) \
+ __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer( \
+ __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
+
#define KMP_SET_THREAD_STATE(state_name) \
__kmp_stats_thread_ptr->setState(state_name)
diff --git a/runtime/src/kmp_stats_timing.cpp b/runtime/src/kmp_stats_timing.cpp
index fe85dc4..2fcbaab 100644
--- a/runtime/src/kmp_stats_timing.cpp
+++ b/runtime/src/kmp_stats_timing.cpp
@@ -95,10 +95,11 @@ std::string formatSI(double interval, int width, char unit) {
static struct {
double scale;
char prefix;
- } ranges[] = {{1.e12, 'f'}, {1.e9, 'p'}, {1.e6, 'n'}, {1.e3, 'u'},
- {1.0, 'm'}, {1.e-3, ' '}, {1.e-6, 'k'}, {1.e-9, 'M'},
- {1.e-12, 'G'}, {1.e-15, 'T'}, {1.e-18, 'P'}, {1.e-21, 'E'},
- {1.e-24, 'Z'}, {1.e-27, 'Y'}};
+ } ranges[] = {{1.e21, 'y'}, {1.e18, 'z'}, {1.e15, 'a'}, {1.e12, 'f'},
+ {1.e9, 'p'}, {1.e6, 'n'}, {1.e3, 'u'}, {1.0, 'm'},
+ {1.e-3, ' '}, {1.e-6, 'k'}, {1.e-9, 'M'}, {1.e-12, 'G'},
+ {1.e-15, 'T'}, {1.e-18, 'P'}, {1.e-21, 'E'}, {1.e-24, 'Z'},
+ {1.e-27, 'Y'}};
if (interval == 0.0) {
os << std::setw(width - 3) << std::right << "0.00" << std::setw(3)
diff --git a/runtime/src/z_Linux_util.cpp b/runtime/src/z_Linux_util.cpp
index fd77e9e..8c59b43 100644
--- a/runtime/src/z_Linux_util.cpp
+++ b/runtime/src/z_Linux_util.cpp
@@ -510,7 +510,7 @@ static void *__kmp_launch_worker(void *thr) {
#if KMP_STATS_ENABLED
// set thread local index to point to thread-specific stats
__kmp_stats_thread_ptr = ((kmp_info_t *)thr)->th.th_stats;
- KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
+ __kmp_stats_thread_ptr->startLife();
KMP_SET_THREAD_STATE(IDLE);
KMP_INIT_PARTITIONED_TIMERS(OMP_idle);
#endif