From 747e94ae3d1b4c9bf5380e569f614eb9040b79e7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 8 Oct 2010 13:51:48 -0400 Subject: ring-buffer: Make write slow path out of line Gcc inlines the slow path of the ring buffer write which can hurt performance. This patch simply forces the slow path function rb_move_tail() to always be a function. The ring_buffer_benchmark module with reader_disabled=1 shows that this patch changes the time to record an event from 135 ns to 132 ns. (3 ns or 2.22% improvement) Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bca96377fd4..0b88df849a5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1823,7 +1823,10 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, local_sub(length, &tail_page->write); } -static struct ring_buffer_event * +/* + * This is the slow path, force gcc not to inline it. + */ +static noinline struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long length, unsigned long tail, struct buffer_page *tail_page, u64 *ts) @@ -1943,7 +1946,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, tail = write - length; /* See if we shot pass the end of this buffer page */ - if (write > BUF_PAGE_SIZE) + if (unlikely(write > BUF_PAGE_SIZE)) return rb_move_tail(cpu_buffer, length, tail, tail_page, ts); -- cgit v1.2.3 From e8bc43e84fada397af1b677b07dbf26e6ac78fcc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 Oct 2010 10:58:02 -0400 Subject: ring-buffer: Pass timestamp by value and not by reference The original code for the ring buffer had locations that modified the timestamp and that change was used by the callers. Now, the timestamp is not reused by the callers and there is no reason to pass it by reference. By changing the call to pass by value, lets gcc optimize the code a bit more where it can store the timestamp in a register and not worry about updating the reference. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 0b88df849a5..c8ce6bde7fa 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1829,7 +1829,7 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, static noinline struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long length, unsigned long tail, - struct buffer_page *tail_page, u64 *ts) + struct buffer_page *tail_page, u64 ts) { struct buffer_page *commit_page = cpu_buffer->commit_page; struct ring_buffer *buffer = cpu_buffer->buffer; @@ -1912,8 +1912,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, * Nested commits always have zero deltas, so * just reread the time stamp */ - *ts = rb_time_stamp(buffer); - next_page->page->time_stamp = *ts; + ts = rb_time_stamp(buffer); + next_page->page->time_stamp = ts; } out_again: @@ -1932,7 +1932,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned type, unsigned long length, u64 *ts) + unsigned type, unsigned long length, u64 ts) { struct buffer_page *tail_page; struct ring_buffer_event *event; @@ -1965,7 +1965,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * its timestamp. */ if (!tail) - tail_page->page->time_stamp = *ts; + tail_page->page->time_stamp = ts; return event; } @@ -2008,7 +2008,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, static int rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, - u64 *ts, u64 *delta) + u64 ts, u64 *delta) { struct ring_buffer_event *event; int ret; @@ -2016,7 +2016,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, WARN_ONCE(*delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", (unsigned long long)*delta, - (unsigned long long)*ts, + (unsigned long long)ts, (unsigned long long)cpu_buffer->write_stamp); /* @@ -2051,7 +2051,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, event->array[0] = 0; } } - cpu_buffer->write_stamp = *ts; + cpu_buffer->write_stamp = ts; /* let the caller know this was the commit */ ret = 1; } else { @@ -2175,7 +2175,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, delta = diff; if (unlikely(test_time_stamp(delta))) { - commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); + commit = rb_add_time_stamp(cpu_buffer, ts, &delta); if (commit == -EBUSY) goto out_fail; @@ -2187,7 +2187,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, } get_event: - event = __rb_reserve_next(cpu_buffer, 0, length, &ts); + event = __rb_reserve_next(cpu_buffer, 0, length, ts); if (unlikely(PTR_ERR(event) == -EAGAIN)) goto again; -- cgit v1.2.3 From f25106aeab7408394b9dd707e5ecf557e269c723 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 Oct 2010 12:40:12 -0400 Subject: ring-buffer: Pass delta by value and not by reference The delta between events is passed to the timestamp code by reference and the timestamp code will reset the value. But it can be reset from the caller. No need to pass it in by reference. By changing the call to pass by value, lets gcc optimize the code a bit more where it can store the delta in a register and not worry about updating the reference. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c8ce6bde7fa..3af77cd47f2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2008,14 +2008,14 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, static int rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, - u64 ts, u64 *delta) + u64 ts, u64 delta) { struct ring_buffer_event *event; int ret; - WARN_ONCE(*delta > (1ULL << 59), + WARN_ONCE(delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", - (unsigned long long)*delta, + (unsigned long long)delta, (unsigned long long)ts, (unsigned long long)cpu_buffer->write_stamp); @@ -2041,8 +2041,8 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, * and if we can't just make it zero. */ if (rb_event_index(event)) { - event->time_delta = *delta & TS_MASK; - event->array[0] = *delta >> TS_SHIFT; + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; } else { /* try to discard, since we do not need this */ if (!rb_try_to_discard(cpu_buffer, event)) { @@ -2064,8 +2064,6 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, ret = 0; } - *delta = 0; - return ret; } @@ -2175,7 +2173,9 @@ rb_reserve_next_event(struct ring_buffer *buffer, delta = diff; if (unlikely(test_time_stamp(delta))) { - commit = rb_add_time_stamp(cpu_buffer, ts, &delta); + commit = rb_add_time_stamp(cpu_buffer, ts, delta); + delta = 0; + if (commit == -EBUSY) goto out_fail; -- cgit v1.2.3 From 69d1b839f7eee347e357b3f6cce7f630cc6ff93d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 Oct 2010 18:18:05 -0400 Subject: ring-buffer: Bind time extend and data events together When the time between two timestamps is greater than 2^27 nanosecs (~134 ms) a time extend event is added that extends the time difference to 59 bits (~18 years). This is due to events only having a 27 bit field to store time. Currently this time extend is a separate event. We add it just before the event data that is being written to the buffer. But before the event data is committed, the event data can also be discarded (as with the case of filters). But because the time extend has already been committed, it will stay in the buffer. If lots of events are being filtered and no event is being written, then every 134ms a time extend can be added to the buffer without any data attached. To keep from filling the entire buffer with time extends, a time extend will never be the first event in a page because the page timestamp can be used. Time extends can only fill the rest of a page with some data at the beginning. This patch binds the time extend with the data. The difference here is that the time extend is not committed before the data is added. Instead, when a time extend is needed, the space reserved on the ring buffer is the time extend + the data event size. The time extend is added to the first part of the reserved block and the data is added to the second. The time extend event is passed back to the reserver, but since the reserver also uses a function to find the data portion of the reserved block, no changes to the ring buffer interface need to be made. When a commit is discarded, we now remove both the time extend and the event. With this approach no more than one time extend can be in the buffer in a row. Data must always follow a time extend. Thanks to Mathieu Desnoyers for suggesting this idea. Suggested-by: Mathieu Desnoyers Cc: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 266 ++++++++++++++++++++++++--------------------- 1 file changed, 142 insertions(+), 124 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3af77cd47f2..f50f43107e9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -224,6 +224,9 @@ enum { RB_LEN_TIME_STAMP = 16, }; +#define skip_time_extend(event) \ + ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND)) + static inline int rb_null_event(struct ring_buffer_event *event) { return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; @@ -248,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event) return length + RB_EVNT_HDR_SIZE; } -/* inline for ring buffer fast paths */ -static unsigned +/* + * Return the length of the given event. Will return + * the length of the time extend if the event is a + * time extend. + */ +static inline unsigned rb_event_length(struct ring_buffer_event *event) { switch (event->type_len) { @@ -274,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event) return 0; } +/* + * Return total length of time extend and data, + * or just the event length for all other events. + */ +static inline unsigned +rb_event_ts_length(struct ring_buffer_event *event) +{ + unsigned len = 0; + + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + /* time extends include the data event after it */ + len = RB_LEN_TIME_EXTEND; + event = skip_time_extend(event); + } + return len + rb_event_length(event); +} + /** * ring_buffer_event_length - return the length of the event * @event: the event to get the length of + * + * Returns the size of the data load of a data event. + * If the event is something other than a data event, it + * returns the size of the event itself. With the exception + * of a TIME EXTEND, where it still returns the size of the + * data load of the data event after it. */ unsigned ring_buffer_event_length(struct ring_buffer_event *event) { - unsigned length = rb_event_length(event); + unsigned length; + + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); + + length = rb_event_length(event); if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) return length; length -= RB_EVNT_HDR_SIZE; @@ -294,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length); static void * rb_event_data(struct ring_buffer_event *event) { + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); /* If length is in len field, then array[0] has the data */ if (event->type_len) @@ -1546,6 +1583,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) iter->head = 0; } +/* Slow path, do not inline */ +static noinline struct ring_buffer_event * +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) +{ + event->type_len = RINGBUF_TYPE_TIME_EXTEND; + + /* Not the first event on the page? */ + if (rb_event_index(event)) { + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + } else { + /* nope, just zero it */ + event->time_delta = 0; + event->array[0] = 0; + } + + return skip_time_extend(event); +} + /** * ring_buffer_update_event - update event type and data * @event: the even to update @@ -1558,28 +1614,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) * data field. */ static void -rb_update_event(struct ring_buffer_event *event, - unsigned type, unsigned length) +rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, unsigned length, + int add_timestamp, u64 delta) { - event->type_len = type; - - switch (type) { - - case RINGBUF_TYPE_PADDING: - case RINGBUF_TYPE_TIME_EXTEND: - case RINGBUF_TYPE_TIME_STAMP: - break; + /* Only a commit updates the timestamp */ + if (unlikely(!rb_event_is_commit(cpu_buffer, event))) + delta = 0; - case 0: - length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) - event->array[0] = length; - else - event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); - break; - default: - BUG(); + /* + * If we need to add a timestamp, then we + * add it to the start of the resevered space. + */ + if (unlikely(add_timestamp)) { + event = rb_add_time_stamp(event, delta); + length -= RB_LEN_TIME_EXTEND; + delta = 0; } + + event->time_delta = delta; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { + event->type_len = 0; + event->array[0] = length; + } else + event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); } /* @@ -1932,12 +1991,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned type, unsigned long length, u64 ts) + unsigned long length, u64 ts, + u64 delta, int add_timestamp) { struct buffer_page *tail_page; struct ring_buffer_event *event; unsigned long tail, write; + /* + * If the time delta since the last event is too big to + * hold in the time field of the event, then we append a + * TIME EXTEND event ahead of the data event. + */ + if (unlikely(add_timestamp)) + length += RB_LEN_TIME_EXTEND; + tail_page = cpu_buffer->tail_page; write = local_add_return(length, &tail_page->write); @@ -1954,11 +2022,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, event = __rb_page_index(tail_page, tail); kmemcheck_annotate_bitfield(event, bitfield); - rb_update_event(event, type, length); + rb_update_event(cpu_buffer, event, length, add_timestamp, delta); - /* The passed in type is zero for DATA */ - if (likely(!type)) - local_inc(&tail_page->entries); + local_inc(&tail_page->entries); /* * If this is the first commit on the page, then update @@ -1980,7 +2046,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, unsigned long addr; new_index = rb_event_index(event); - old_index = new_index + rb_event_length(event); + old_index = new_index + rb_event_ts_length(event); addr = (unsigned long)event; addr &= PAGE_MASK; @@ -2006,67 +2072,6 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, return 0; } -static int -rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, - u64 ts, u64 delta) -{ - struct ring_buffer_event *event; - int ret; - - WARN_ONCE(delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", - (unsigned long long)delta, - (unsigned long long)ts, - (unsigned long long)cpu_buffer->write_stamp); - - /* - * The delta is too big, we to add a - * new timestamp. - */ - event = __rb_reserve_next(cpu_buffer, - RINGBUF_TYPE_TIME_EXTEND, - RB_LEN_TIME_EXTEND, - ts); - if (!event) - return -EBUSY; - - if (PTR_ERR(event) == -EAGAIN) - return -EAGAIN; - - /* Only a commited time event can update the write stamp */ - if (rb_event_is_commit(cpu_buffer, event)) { - /* - * If this is the first on the page, then it was - * updated with the page itself. Try to discard it - * and if we can't just make it zero. - */ - if (rb_event_index(event)) { - event->time_delta = delta & TS_MASK; - event->array[0] = delta >> TS_SHIFT; - } else { - /* try to discard, since we do not need this */ - if (!rb_try_to_discard(cpu_buffer, event)) { - /* nope, just zero it */ - event->time_delta = 0; - event->array[0] = 0; - } - } - cpu_buffer->write_stamp = ts; - /* let the caller know this was the commit */ - ret = 1; - } else { - /* Try to discard the event */ - if (!rb_try_to_discard(cpu_buffer, event)) { - /* Darn, this is just wasted space */ - event->time_delta = 0; - event->array[0] = 0; - } - ret = 0; - } - - return ret; -} - static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) { local_inc(&cpu_buffer->committing); @@ -2111,9 +2116,9 @@ rb_reserve_next_event(struct ring_buffer *buffer, unsigned long length) { struct ring_buffer_event *event; - u64 ts, delta = 0; - int commit = 0; + u64 ts, delta; int nr_loops = 0; + int add_timestamp; rb_start_commit(cpu_buffer); @@ -2134,6 +2139,9 @@ rb_reserve_next_event(struct ring_buffer *buffer, length = rb_calculate_event_length(length); again: + add_timestamp = 0; + delta = 0; + /* * We allow for interrupts to reenter here and do a trace. * If one does, it will cause this original code to loop @@ -2172,33 +2180,24 @@ rb_reserve_next_event(struct ring_buffer *buffer, delta = diff; if (unlikely(test_time_stamp(delta))) { - - commit = rb_add_time_stamp(cpu_buffer, ts, delta); - delta = 0; - - if (commit == -EBUSY) - goto out_fail; - - if (commit == -EAGAIN) - goto again; - - RB_WARN_ON(cpu_buffer, commit < 0); + WARN_ONCE(delta > (1ULL << 59), + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", + (unsigned long long)delta, + (unsigned long long)ts, + (unsigned long long)cpu_buffer->write_stamp); + add_timestamp = 1; } } get_event: - event = __rb_reserve_next(cpu_buffer, 0, length, ts); + event = __rb_reserve_next(cpu_buffer, length, ts, + delta, add_timestamp); if (unlikely(PTR_ERR(event) == -EAGAIN)) goto again; if (!event) goto out_fail; - if (!rb_event_is_commit(cpu_buffer, event)) - delta = 0; - - event->time_delta = delta; - return event; out_fail: @@ -2311,12 +2310,28 @@ static void rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { + u64 delta; + /* * The event first in the commit queue updates the * time stamp. */ - if (rb_event_is_commit(cpu_buffer, event)) - cpu_buffer->write_stamp += event->time_delta; + if (rb_event_is_commit(cpu_buffer, event)) { + /* + * A commit event that is first on a page + * updates the write timestamp with the page stamp + */ + if (!rb_event_index(event)) + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += event->time_delta; + cpu_buffer->write_stamp += delta; + } else + cpu_buffer->write_stamp += event->time_delta; + } } static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, @@ -2356,6 +2371,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); static inline void rb_event_discard(struct ring_buffer_event *event) { + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); + /* array[0] holds the actual length for the discarded event */ event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; event->type_len = RINGBUF_TYPE_PADDING; @@ -3043,12 +3061,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, again: /* - * We repeat when a timestamp is encountered. It is possible - * to get multiple timestamps from an interrupt entering just - * as one timestamp is about to be written, or from discarded - * commits. The most that we can have is the number on a single page. + * We repeat when a time extend is encountered. + * Since the time extend is always attached to a data event, + * we should never loop more than once. + * (We never hit the following condition more than twice). */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) return NULL; reader = rb_get_reader_page(cpu_buffer); @@ -3124,14 +3142,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) return NULL; /* - * We repeat when a timestamp is encountered. - * We can get multiple timestamps by nested interrupts or also - * if filtering is on (discarding commits). Since discarding - * commits can be frequent we can get a lot of timestamps. - * But we limit them by not adding timestamps if they begin - * at the start of a page. + * We repeat when a time extend is encountered. + * Since the time extend is always attached to a data event, + * we should never loop more than once. + * (We never hit the following condition more than twice). */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) return NULL; if (rb_per_cpu_empty(cpu_buffer)) @@ -3829,7 +3845,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer, if (len > (commit - read)) len = (commit - read); - size = rb_event_length(event); + /* Always keep the time extend and data together */ + size = rb_event_ts_length(event); if (len < size) goto out_unlock; @@ -3851,7 +3868,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer, break; event = rb_reader_event(cpu_buffer); - size = rb_event_length(event); + /* Always keep the time extend and data together */ + size = rb_event_ts_length(event); } while (len > size); /* update bpage */ -- cgit v1.2.3 From 140ff89127c74b1b1c1b0152a36ea3720ccf6bc3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 8 Oct 2010 10:50:30 -0400 Subject: ring-buffer: Remove condition to add timestamp in fast path There's a condition to check if we should add a time extend or not in the fast path. But this condition is racey (in the sense that we can add a unnecessary time extend, but nothing that can break anything). We later check if the time or event time delta should be zero or have real data in it (not racey), making this first check redundant. This check may help save space once in a while, but really is not worth the hassle to try to save some space that happens at most 134 ms at a time. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f50f43107e9..d9f3e7a8213 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2119,6 +2119,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, u64 ts, delta; int nr_loops = 0; int add_timestamp; + u64 diff; rb_start_commit(cpu_buffer); @@ -2155,29 +2156,13 @@ rb_reserve_next_event(struct ring_buffer *buffer, goto out_fail; ts = rb_time_stamp(cpu_buffer->buffer); + diff = ts - cpu_buffer->write_stamp; - /* - * Only the first commit can update the timestamp. - * Yes there is a race here. If an interrupt comes in - * just after the conditional and it traces too, then it - * will also check the deltas. More than one timestamp may - * also be made. But only the entry that did the actual - * commit will be something other than zero. - */ - if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page && - rb_page_write(cpu_buffer->tail_page) == - rb_commit_index(cpu_buffer))) { - u64 diff; - - diff = ts - cpu_buffer->write_stamp; - - /* make sure this diff is calculated here */ - barrier(); - - /* Did the write stamp get updated already? */ - if (unlikely(ts < cpu_buffer->write_stamp)) - goto get_event; + /* make sure this diff is calculated here */ + barrier(); + /* Did the write stamp get updated already? */ + if (likely(ts >= cpu_buffer->write_stamp)) { delta = diff; if (unlikely(test_time_stamp(delta))) { WARN_ONCE(delta > (1ULL << 59), @@ -2189,7 +2174,6 @@ rb_reserve_next_event(struct ring_buffer *buffer, } } - get_event: event = __rb_reserve_next(cpu_buffer, length, ts, delta, add_timestamp); if (unlikely(PTR_ERR(event) == -EAGAIN)) -- cgit v1.2.3 From d9abde2138e0a00a0d7e44676928efa0ef629d48 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 19 Oct 2010 13:17:08 -0400 Subject: ring-buffer: Micro-optimize with some strategic inlining By using inline and noinline, we are able to make the fast path of recording an event 4% faster. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d9f3e7a8213..f5007d0d932 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2078,7 +2078,7 @@ static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) local_inc(&cpu_buffer->commits); } -static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) +static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long commits; @@ -2193,13 +2193,9 @@ rb_reserve_next_event(struct ring_buffer *buffer, #define TRACE_RECURSIVE_DEPTH 16 -static int trace_recursive_lock(void) +/* Keep this code out of the fast path cache */ +static noinline void trace_recursive_fail(void) { - current->trace_recursion++; - - if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) - return 0; - /* Disable all tracing before we do anything else */ tracing_off_permanent(); @@ -2211,10 +2207,21 @@ static int trace_recursive_lock(void) in_nmi()); WARN_ON_ONCE(1); +} + +static inline int trace_recursive_lock(void) +{ + current->trace_recursion++; + + if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) + return 0; + + trace_recursive_fail(); + return -1; } -static void trace_recursive_unlock(void) +static inline void trace_recursive_unlock(void) { WARN_ON_ONCE(!current->trace_recursion); -- cgit v1.2.3 From b8b2663bd7c9da04ac804659b9f617c199d0252c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 19 Oct 2010 13:23:25 -0400 Subject: ring-buffer: Remove unused macro RB_TIMESTAMPS_PER_PAGE With the binding of time extends to events we no longer need to use the macro RB_TIMESTAMPS_PER_PAGE. Remove it. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f5007d0d932..ad25490f8b4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -441,9 +441,6 @@ static inline int test_time_stamp(u64 delta) /* Max payload is BUF_PAGE_SIZE - header (8bytes) */ #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) -/* Max number of timestamps that can fit on a page */ -#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND) - int ring_buffer_print_page_header(struct trace_seq *s) { struct buffer_data_page field; -- cgit v1.2.3 From dd49a38cf30944be27892c10b1c0e5b3fa73bcb2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 Oct 2010 21:51:26 -0400 Subject: tracing: Do not limit the size of the number of CPU buffers The tracing per_cpu buffers were limited to 999 CPUs for a mear savings in stack space of a char array. Up the array to 30 characters which is more than enough to hold a 64 bit number. Reported-by: Robin Holt Suggested-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 001bcd2ccf4..82d9b8106cd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3996,13 +3996,9 @@ static void tracing_init_debugfs_percpu(long cpu) { struct dentry *d_percpu = tracing_dentry_percpu(); struct dentry *d_cpu; - /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ - char cpu_dir[7]; + char cpu_dir[30]; /* 30 characters should be more than enough */ - if (cpu > 999 || cpu < 0) - return; - - sprintf(cpu_dir, "cpu%ld", cpu); + snprintf(cpu_dir, 30, "cpu%ld", cpu); d_cpu = debugfs_create_dir(cpu_dir, d_percpu); if (!d_cpu) { pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); -- cgit v1.2.3 From f4bc6bb2d562703eafc895c37e7be20906de139d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 Oct 2010 15:00:13 +0200 Subject: tracing: Cleanup the convoluted softirq tracepoints With the addition of trace_softirq_raise() the softirq tracepoint got even more convoluted. Why the tracepoints take two pointers to assign an integer is beyond my comprehension. But adding an extra case which treats the first pointer as an unsigned long when the second pointer is NULL including the back and forth type casting is just horrible. Convert the softirq tracepoints to take a single unsigned int argument for the softirq vector number and fix the call sites. Signed-off-by: Thomas Gleixner LKML-Reference: Acked-by: Peter Zijlstra Acked-by: mathieu.desnoyers@efficios.com Cc: Frederic Weisbecker Cc: Steven Rostedt --- kernel/softirq.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 07b4f1b1a73..b3cb1dc1579 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -212,18 +212,20 @@ restart: do { if (pending & 1) { + unsigned int vec_nr = h - softirq_vec; int prev_count = preempt_count(); - kstat_incr_softirqs_this_cpu(h - softirq_vec); - trace_softirq_entry(h, softirq_vec); + kstat_incr_softirqs_this_cpu(vec_nr); + + trace_softirq_entry(vec_nr); h->action(h); - trace_softirq_exit(h, softirq_vec); + trace_softirq_exit(vec_nr); if (unlikely(prev_count != preempt_count())) { - printk(KERN_ERR "huh, entered softirq %td %s %p" + printk(KERN_ERR "huh, entered softirq %u %s %p" "with preempt_count %08x," - " exited with %08x?\n", h - softirq_vec, - softirq_to_name[h - softirq_vec], - h->action, prev_count, preempt_count()); + " exited with %08x?\n", vec_nr, + softirq_to_name[vec_nr], h->action, + prev_count, preempt_count()); preempt_count() = prev_count; } -- cgit v1.2.3 From 9ffcfa6f1f63eeac15555b745c292eb9f59130f6 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 20 Oct 2010 15:25:01 +0200 Subject: perf_events: Revert: Fix transaction recovery in group_sched_in() This patch reverts commit 8e5fc1a (perf_events: Fix transaction recovery in group_sched_in()) because it had one flaw in case the group could never be scheduled. It would cause time_enabled to get negative. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4cbeeeb7.0aefd80a.6e40.0e2f@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 76 +++++++++-------------------------------------------- 1 file changed, 13 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index f309e8014c7..39afdb07d75 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -417,8 +417,8 @@ event_filter_match(struct perf_event *event) return event->cpu == -1 || event->cpu == smp_processor_id(); } -static int -__event_sched_out(struct perf_event *event, +static void +event_sched_out(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { @@ -437,13 +437,14 @@ __event_sched_out(struct perf_event *event, } if (event->state != PERF_EVENT_STATE_ACTIVE) - return 0; + return; event->state = PERF_EVENT_STATE_INACTIVE; if (event->pending_disable) { event->pending_disable = 0; event->state = PERF_EVENT_STATE_OFF; } + event->tstamp_stopped = ctx->time; event->pmu->del(event, 0); event->oncpu = -1; @@ -452,19 +453,6 @@ __event_sched_out(struct perf_event *event, ctx->nr_active--; if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; - return 1; -} - -static void -event_sched_out(struct perf_event *event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - int ret; - - ret = __event_sched_out(event, cpuctx, ctx); - if (ret) - event->tstamp_stopped = ctx->time; } static void @@ -664,7 +652,7 @@ retry: } static int -__event_sched_in(struct perf_event *event, +event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { @@ -684,6 +672,8 @@ __event_sched_in(struct perf_event *event, return -EAGAIN; } + event->tstamp_running += ctx->time - event->tstamp_stopped; + if (!is_software_event(event)) cpuctx->active_oncpu++; ctx->nr_active++; @@ -694,35 +684,6 @@ __event_sched_in(struct perf_event *event, return 0; } -static inline int -event_sched_in(struct perf_event *event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - int ret = __event_sched_in(event, cpuctx, ctx); - if (ret) - return ret; - event->tstamp_running += ctx->time - event->tstamp_stopped; - return 0; -} - -static void -group_commit_event_sched_in(struct perf_event *group_event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - struct perf_event *event; - u64 now = ctx->time; - - group_event->tstamp_running += now - group_event->tstamp_stopped; - /* - * Schedule in siblings as one group (if any): - */ - list_for_each_entry(event, &group_event->sibling_list, group_entry) { - event->tstamp_running += now - event->tstamp_stopped; - } -} - static int group_sched_in(struct perf_event *group_event, struct perf_cpu_context *cpuctx, @@ -736,13 +697,7 @@ group_sched_in(struct perf_event *group_event, pmu->start_txn(pmu); - /* - * use __event_sched_in() to delay updating tstamp_running - * until the transaction is committed. In case of failure - * we will keep an unmodified tstamp_running which is a - * requirement to get correct timing information - */ - if (__event_sched_in(group_event, cpuctx, ctx)) { + if (event_sched_in(group_event, cpuctx, ctx)) { pmu->cancel_txn(pmu); return -EAGAIN; } @@ -751,31 +706,26 @@ group_sched_in(struct perf_event *group_event, * Schedule in siblings as one group (if any): */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { - if (__event_sched_in(event, cpuctx, ctx)) { + if (event_sched_in(event, cpuctx, ctx)) { partial_group = event; goto group_error; } } - if (!pmu->commit_txn(pmu)) { - /* commit tstamp_running */ - group_commit_event_sched_in(group_event, cpuctx, ctx); + if (!pmu->commit_txn(pmu)) return 0; - } + group_error: /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: - * - * use __event_sched_out() to avoid updating tstamp_stopped - * because the event never actually ran */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { if (event == partial_group) break; - __event_sched_out(event, cpuctx, ctx); + event_sched_out(event, cpuctx, ctx); } - __event_sched_out(group_event, cpuctx, ctx); + event_sched_out(group_event, cpuctx, ctx); pmu->cancel_txn(pmu); -- cgit v1.2.3 From d7842da470f244d258f21c5f72cd8388b3541d04 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 20 Oct 2010 15:25:01 +0200 Subject: perf_events: Fix for transaction recovery in group_sched_in() This new version (see commit 8e5fc1a) is much simpler and ensures that in case of error in group_sched_in() during event_sched_in(), the events up to the failed event go through regular event_sched_out(). But the failed event and the remaining events in the group have their timings adjusted as if they had also gone through event_sched_in() and event_sched_out(). This ensures timing uniformity across all events in a group. This also takes care of the tstamp_stopped problem in case the group could never be scheduled. The tstamp_stopped is updated as if the event had actually run. With this patch, the following now reports correct time_enabled, in case the NMI watchdog is active: $ task -e unhalted_core_cycles,instructions_retired,baclears,baclears noploop 1 noploop for 1 seconds 0 unhalted_core_cycles (100.00% scaling, ena=997,552,872, run=0) 0 instructions_retired (100.00% scaling, ena=997,552,872, run=0) 0 baclears (100.00% scaling, ena=997,552,872, run=0) 0 baclears (100.00% scaling, ena=997,552,872, run=0) And the older test case also works: $ task -einstructions_retired,baclears,baclears -e unhalted_core_cycles,baclears,baclears sleep 5 1680885 instructions_retired (69.39% scaling, ena=950756, run=291006) 10735 baclears (69.39% scaling, ena=950756, run=291006) 10735 baclears (69.39% scaling, ena=950756, run=291006) 0 unhalted_core_cycles (100.00% scaling, ena=817932, run=0) 0 baclears (100.00% scaling, ena=817932, run=0) 0 baclears (100.00% scaling, ena=817932, run=0) Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4cbeeebc.8ee7d80a.5a28.0d5f@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 39afdb07d75..517d827f498 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -691,6 +691,8 @@ group_sched_in(struct perf_event *group_event, { struct perf_event *event, *partial_group = NULL; struct pmu *pmu = group_event->pmu; + u64 now = ctx->time; + bool simulate = false; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; @@ -719,11 +721,27 @@ group_error: /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: + * The events up to the failed event are scheduled out normally, + * tstamp_stopped will be updated. + * + * The failed events and the remaining siblings need to have + * their timings updated as if they had gone thru event_sched_in() + * and event_sched_out(). This is required to get consistent timings + * across the group. This also takes care of the case where the group + * could never be scheduled by ensuring tstamp_stopped is set to mark + * the time the event was actually stopped, such that time delta + * calculation in update_event_times() is correct. */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { if (event == partial_group) - break; - event_sched_out(event, cpuctx, ctx); + simulate = true; + + if (simulate) { + event->tstamp_running += now - event->tstamp_stopped; + event->tstamp_stopped = now; + } else { + event_sched_out(event, cpuctx, ctx); + } } event_sched_out(group_event, cpuctx, ctx); -- cgit v1.2.3 From aa7b250c252cc8e6b1daf0e1eada5eba42a1a68d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 21 Oct 2010 22:17:19 -0700 Subject: tracing: Fix 'faild' -> 'failed' typo Signed-off-by: Joe Perches Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Jiri Kosina LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 544301d29de..b8d2852baa4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -648,7 +648,7 @@ static int register_trace_probe(struct trace_probe *tp) } ret = register_probe_event(tp); if (ret) { - pr_warning("Faild to register probe event(%d)\n", ret); + pr_warning("Failed to register probe event(%d)\n", ret); goto end; } -- cgit v1.2.3 From 43948f50276eca010a22726860dfe9a4e8130136 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 25 Oct 2010 22:18:01 +0900 Subject: kprobes: Remove redundant text_mutex lock in optimize Remove text_mutex locking in optimize_all_kprobes, because this function doesn't modify text. It simply queues probes on optimization list for kprobe_optimizer worker thread. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. Miller Cc: Namhyung Kim Cc: Jason Baron Cc: Peter Zijlstra LKML-Reference: <20101025131801.19160.70939.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ec4210c6501..7c44133f51e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -74,7 +74,8 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; /* NOTE: change this value only with kprobe_mutex held */ static bool kprobes_all_disarmed; -static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ +/* This protects kprobe_table and optimizing_list */ +static DEFINE_MUTEX(kprobe_mutex); static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; static struct { spinlock_t lock ____cacheline_aligned_in_smp; @@ -595,6 +596,7 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p) } #ifdef CONFIG_SYSCTL +/* This should be called with kprobe_mutex locked */ static void __kprobes optimize_all_kprobes(void) { struct hlist_head *head; @@ -607,17 +609,16 @@ static void __kprobes optimize_all_kprobes(void) return; kprobes_allow_optimization = true; - mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) if (!kprobe_disabled(p)) optimize_kprobe(p); } - mutex_unlock(&text_mutex); printk(KERN_INFO "Kprobes globally optimized\n"); } +/* This should be called with kprobe_mutex locked */ static void __kprobes unoptimize_all_kprobes(void) { struct hlist_head *head; -- cgit v1.2.3