diff options
-rw-r--r-- | Documentation/trace/ftrace.rst | 21 | ||||
-rw-r--r-- | drivers/scsi/qla2xxx/qla_os.c | 2 | ||||
-rw-r--r-- | include/linux/ring_buffer.h | 18 | ||||
-rw-r--r-- | include/linux/trace.h | 4 | ||||
-rw-r--r-- | include/linux/trace_seq.h | 9 | ||||
-rw-r--r-- | kernel/trace/ring_buffer.c | 733 | ||||
-rw-r--r-- | kernel/trace/ring_buffer_benchmark.c | 10 | ||||
-rw-r--r-- | kernel/trace/trace.c | 234 | ||||
-rw-r--r-- | kernel/trace/trace.h | 2 | ||||
-rw-r--r-- | kernel/trace/trace_boot.c | 2 | ||||
-rw-r--r-- | kernel/trace/trace_events.c | 107 | ||||
-rw-r--r-- | kernel/trace/trace_events_hist.c | 49 | ||||
-rw-r--r-- | kernel/trace/trace_seq.c | 3 | ||||
-rw-r--r-- | samples/ftrace/sample-trace-array.c | 2 | ||||
-rw-r--r-- | tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_subbuf_size.tc | 95 | ||||
-rw-r--r-- | tools/testing/selftests/ftrace/test.d/00basic/trace_marker.tc | 82 |
16 files changed, 999 insertions, 374 deletions
diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index 16122a8895ba..7e7b8ec17934 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -218,6 +218,27 @@ of ftrace. Here is a list of some of the key files: This displays the total combined size of all the trace buffers. + buffer_subbuf_size_kb: + + This sets or displays the sub buffer size. The ring buffer is broken up + into several same size "sub buffers". An event can not be bigger than + the size of the sub buffer. Normally, the sub buffer is the size of the + architecture's page (4K on x86). The sub buffer also contains meta data + at the start which also limits the size of an event. That means when + the sub buffer is a page size, no event can be larger than the page + size minus the sub buffer meta data. + + Note, the buffer_subbuf_size_kb is a way for the user to specify the + minimum size of the subbuffer. The kernel may make it bigger due to the + implementation details, or simply fail the operation if the kernel can + not handle the request. + + Changing the sub buffer size allows for events to be larger than the + page size. + + Note: When changing the sub-buffer size, tracing is stopped and any + data in the ring buffer and the snapshot buffer will be discarded. + free_buffer: If a process is performing tracing, and the ring buffer should be diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 03348f605c2e..dd674378f2f3 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -2889,7 +2889,7 @@ static void qla2x00_iocb_work_fn(struct work_struct *work) static void qla_trace_init(void) { - qla_trc_array = trace_array_get_by_name("qla2xxx"); + qla_trc_array = trace_array_get_by_name("qla2xxx", NULL); if (!qla_trc_array) { ql_log(ql_log_fatal, NULL, 0x0001, "Unable to create qla2xxx trace instance, instance logging will be disabled.\n"); diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 782e14f62201..fa802db216f9 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -141,6 +141,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter); bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter); unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu); +unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer); void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu); void ring_buffer_reset_online_cpus(struct trace_buffer *buffer); @@ -191,15 +192,24 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer); size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu); size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu); -void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu); -void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data); -int ring_buffer_read_page(struct trace_buffer *buffer, void **data_page, +struct buffer_data_read_page; +struct buffer_data_read_page * +ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu); +void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, + struct buffer_data_read_page *page); +int ring_buffer_read_page(struct trace_buffer *buffer, + struct buffer_data_read_page *data_page, size_t len, int cpu, int full); +void *ring_buffer_read_page_data(struct buffer_data_read_page *page); struct trace_seq; int ring_buffer_print_entry_header(struct trace_seq *s); -int ring_buffer_print_page_header(struct trace_seq *s); +int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s); + +int ring_buffer_subbuf_order_get(struct trace_buffer *buffer); +int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order); +int ring_buffer_subbuf_size_get(struct trace_buffer *buffer); enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, diff --git a/include/linux/trace.h b/include/linux/trace.h index 2a70a447184c..fdcd76b7be83 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -51,7 +51,7 @@ int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...); int trace_array_init_printk(struct trace_array *tr); void trace_array_put(struct trace_array *tr); -struct trace_array *trace_array_get_by_name(const char *name); +struct trace_array *trace_array_get_by_name(const char *name, const char *systems); int trace_array_destroy(struct trace_array *tr); /* For osnoise tracer */ @@ -84,7 +84,7 @@ static inline int trace_array_init_printk(struct trace_array *tr) static inline void trace_array_put(struct trace_array *tr) { } -static inline struct trace_array *trace_array_get_by_name(const char *name) +static inline struct trace_array *trace_array_get_by_name(const char *name, const char *systems) { return NULL; } diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 3691e0e76a1a..9ec229dfddaa 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -8,11 +8,14 @@ /* * Trace sequences are used to allow a function to call several other functions - * to create a string of data to use (up to a max of PAGE_SIZE). + * to create a string of data to use. */ +#define TRACE_SEQ_BUFFER_SIZE (PAGE_SIZE * 2 - \ + (sizeof(struct seq_buf) + sizeof(size_t) + sizeof(int))) + struct trace_seq { - char buffer[PAGE_SIZE]; + char buffer[TRACE_SEQ_BUFFER_SIZE]; struct seq_buf seq; size_t readpos; int full; @@ -21,7 +24,7 @@ struct trace_seq { static inline void trace_seq_init(struct trace_seq *s) { - seq_buf_init(&s->seq, s->buffer, PAGE_SIZE); + seq_buf_init(&s->seq, s->buffer, TRACE_SEQ_BUFFER_SIZE); s->full = 0; s->readpos = 0; } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9286f88fcd32..13aaf5e85b81 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -27,6 +27,7 @@ #include <linux/cpu.h> #include <linux/oom.h> +#include <asm/local64.h> #include <asm/local.h> /* @@ -317,6 +318,11 @@ struct buffer_data_page { unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ }; +struct buffer_data_read_page { + unsigned order; /* order of the page */ + struct buffer_data_page *data; /* actual data, stored in this page */ +}; + /* * Note, the buffer_page list must be first. The buffer pages * are allocated in cache lines, which means that each buffer @@ -331,6 +337,7 @@ struct buffer_page { unsigned read; /* index for next read */ local_t entries; /* entries on this page */ unsigned long real_end; /* real end of data */ + unsigned order; /* order of the page */ struct buffer_data_page *page; /* Actual data page */ }; @@ -361,7 +368,7 @@ static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage) static void free_buffer_page(struct buffer_page *bpage) { - free_page((unsigned long)bpage->page); + free_pages((unsigned long)bpage->page, bpage->order); kfree(bpage); } @@ -373,41 +380,6 @@ static inline bool test_time_stamp(u64 delta) return !!(delta & TS_DELTA_TEST); } -#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) - -/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ -#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) - -int ring_buffer_print_page_header(struct trace_seq *s) -{ - struct buffer_data_page field; - - trace_seq_printf(s, "\tfield: u64 timestamp;\t" - "offset:0;\tsize:%u;\tsigned:%u;\n", - (unsigned int)sizeof(field.time_stamp), - (unsigned int)is_signed_type(u64)); - - trace_seq_printf(s, "\tfield: local_t commit;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), commit), - (unsigned int)sizeof(field.commit), - (unsigned int)is_signed_type(long)); - - trace_seq_printf(s, "\tfield: int overwrite;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), commit), - 1, - (unsigned int)is_signed_type(long)); - - trace_seq_printf(s, "\tfield: char data;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), data), - (unsigned int)BUF_PAGE_SIZE, - (unsigned int)is_signed_type(char)); - - return !trace_seq_has_overflowed(s); -} - struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; @@ -463,27 +435,9 @@ enum { RB_CTX_MAX }; -#if BITS_PER_LONG == 32 -#define RB_TIME_32 -#endif - -/* To test on 64 bit machines */ -//#define RB_TIME_32 - -#ifdef RB_TIME_32 - -struct rb_time_struct { - local_t cnt; - local_t top; - local_t bottom; - local_t msb; -}; -#else -#include <asm/local64.h> struct rb_time_struct { local64_t time; }; -#endif typedef struct rb_time_struct rb_time_t; #define MAX_NEST 5 @@ -557,6 +511,10 @@ struct trace_buffer { struct rb_irq_work irq_work; bool time_stamp_abs; + + unsigned int subbuf_size; + unsigned int subbuf_order; + unsigned int max_data_size; }; struct ring_buffer_iter { @@ -570,150 +528,48 @@ struct ring_buffer_iter { u64 read_stamp; u64 page_stamp; struct ring_buffer_event *event; + size_t event_size; int missed_events; }; -#ifdef RB_TIME_32 - -/* - * On 32 bit machines, local64_t is very expensive. As the ring - * buffer doesn't need all the features of a true 64 bit atomic, - * on 32 bit, it uses these functions (64 still uses local64_t). - * - * For the ring buffer, 64 bit required operations for the time is - * the following: - * - * - Reads may fail if it interrupted a modification of the time stamp. - * It will succeed if it did not interrupt another write even if - * the read itself is interrupted by a write. - * It returns whether it was successful or not. - * - * - Writes always succeed and will overwrite other writes and writes - * that were done by events interrupting the current write. - * - * - A write followed by a read of the same time stamp will always succeed, - * but may not contain the same value. - * - * - A cmpxchg will fail if it interrupted another write or cmpxchg. - * Other than that, it acts like a normal cmpxchg. - * - * The 60 bit time stamp is broken up by 30 bits in a top and bottom half - * (bottom being the least significant 30 bits of the 60 bit time stamp). - * - * The two most significant bits of each half holds a 2 bit counter (0-3). - * Each update will increment this counter by one. - * When reading the top and bottom, if the two counter bits match then the - * top and bottom together make a valid 60 bit number. - */ -#define RB_TIME_SHIFT 30 -#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1) -#define RB_TIME_MSB_SHIFT 60 - -static inline int rb_time_cnt(unsigned long val) +int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s) { - return (val >> RB_TIME_SHIFT) & 3; -} - -static inline u64 rb_time_val(unsigned long top, unsigned long bottom) -{ - u64 val; - - val = top & RB_TIME_VAL_MASK; - val <<= RB_TIME_SHIFT; - val |= bottom & RB_TIME_VAL_MASK; - - return val; -} - -static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) -{ - unsigned long top, bottom, msb; - unsigned long c; - - /* - * If the read is interrupted by a write, then the cnt will - * be different. Loop until both top and bottom have been read - * without interruption. - */ - do { - c = local_read(&t->cnt); - top = local_read(&t->top); - bottom = local_read(&t->bottom); - msb = local_read(&t->msb); - } while (c != local_read(&t->cnt)); - - *cnt = rb_time_cnt(top); - - /* If top, msb or bottom counts don't match, this interrupted a write */ - if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom)) - return false; - - /* The shift to msb will lose its cnt bits */ - *ret = rb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT); - return true; -} - -static bool rb_time_read(rb_time_t *t, u64 *ret) -{ - unsigned long cnt; - - return __rb_time_read(t, ret, &cnt); -} - -static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt) -{ - return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT); -} - -static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom, - unsigned long *msb) -{ - *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK); - *bottom = (unsigned long)(val & RB_TIME_VAL_MASK); - *msb = (unsigned long)(val >> RB_TIME_MSB_SHIFT); -} + struct buffer_data_page field; -static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt) -{ - val = rb_time_val_cnt(val, cnt); - local_set(t, val); -} + trace_seq_printf(s, "\tfield: u64 timestamp;\t" + "offset:0;\tsize:%u;\tsigned:%u;\n", + (unsigned int)sizeof(field.time_stamp), + (unsigned int)is_signed_type(u64)); -static void rb_time_set(rb_time_t *t, u64 val) -{ - unsigned long cnt, top, bottom, msb; + trace_seq_printf(s, "\tfield: local_t commit;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + (unsigned int)sizeof(field.commit), + (unsigned int)is_signed_type(long)); - rb_time_split(val, &top, &bottom, &msb); + trace_seq_printf(s, "\tfield: int overwrite;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + 1, + (unsigned int)is_signed_type(long)); - /* Writes always succeed with a valid number even if it gets interrupted. */ - do { - cnt = local_inc_return(&t->cnt); - rb_time_val_set(&t->top, top, cnt); - rb_time_val_set(&t->bottom, bottom, cnt); - rb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt); - } while (cnt != local_read(&t->cnt)); -} + trace_seq_printf(s, "\tfield: char data;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), data), + (unsigned int)buffer->subbuf_size, + (unsigned int)is_signed_type(char)); -static inline bool -rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set) -{ - return local_try_cmpxchg(l, &expect, set); + return !trace_seq_has_overflowed(s); } -#else /* 64 bits */ - -/* local64_t always succeeds */ - -static inline bool rb_time_read(rb_time_t *t, u64 *ret) +static inline void rb_time_read(rb_time_t *t, u64 *ret) { *ret = local64_read(&t->time); - return true; } static void rb_time_set(rb_time_t *t, u64 val) { local64_set(&t->time, val); } -#endif /* * Enable this to make sure that the event passed to @@ -820,10 +676,7 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, WARN_ONCE(1, "nest (%d) greater than max", nest); fail: - /* Can only fail on 32 bit */ - if (!rb_time_read(&cpu_buffer->write_stamp, &ts)) - /* Screw it, just read the current time */ - ts = rb_time_stamp(cpu_buffer->buffer); + rb_time_read(&cpu_buffer->write_stamp, &ts); return ts; } @@ -1619,10 +1472,12 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add(&bpage->list, pages); - page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0); + page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, + cpu_buffer->buffer->subbuf_order); if (!page) goto free_pages; bpage->page = page_address(page); + bpage->order = cpu_buffer->buffer->subbuf_order; rb_init_page(bpage->page); if (user_thread && fatal_signal_pending(current)) @@ -1701,7 +1556,8 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) rb_check_bpage(cpu_buffer, bpage); cpu_buffer->reader_page = bpage; - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, cpu_buffer->buffer->subbuf_order); if (!page) goto fail_free_reader; bpage->page = page_address(page); @@ -1784,7 +1640,14 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) goto fail_free_buffer; - nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + /* Default buffer page size - one system page */ + buffer->subbuf_order = 0; + buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE; + + /* Max payload is buffer page size - header (8bytes) */ + buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2); + + nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size); buffer->flags = flags; buffer->clock = trace_clock_local; buffer->reader_lock_key = key; @@ -2103,7 +1966,7 @@ static void update_pages_handler(struct work_struct *work) * @size: the new size. * @cpu_id: the cpu buffer to resize * - * Minimum size is 2 * BUF_PAGE_SIZE. + * Minimum size is 2 * buffer->subbuf_size. * * Returns 0 on success and < 0 on failure. */ @@ -2125,7 +1988,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, !cpumask_test_cpu(cpu_id, buffer->cpumask)) return 0; - nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size); /* we need a minimum of two pages */ if (nr_pages < 2) @@ -2372,7 +2235,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter) */ barrier(); - if ((iter->head + length) > commit || length > BUF_PAGE_SIZE) + if ((iter->head + length) > commit || length > iter->event_size) /* Writer corrupted the read? */ goto reset; @@ -2412,11 +2275,13 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) } static __always_inline unsigned -rb_event_index(struct ring_buffer_event *event) +rb_event_index(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { unsigned long addr = (unsigned long)event; - return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; + addr &= (PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1; + + return addr - BUF_PAGE_HDR_SIZE; } static void rb_inc_iter(struct ring_buffer_iter *iter) @@ -2605,6 +2470,7 @@ static inline void rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long tail, struct rb_event_info *info) { + unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size); struct buffer_page *tail_page = info->tail_page; struct ring_buffer_event *event; unsigned long length = info->length; @@ -2613,13 +2479,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, * Only the event that crossed the page boundary * must fill the old tail_page with padding. */ - if (tail >= BUF_PAGE_SIZE) { + if (tail >= bsize) { /* * If the page was filled, then we still need * to update the real_end. Reset it to zero * and the reader will ignore it. */ - if (tail == BUF_PAGE_SIZE) + if (tail == bsize) tail_page->real_end = 0; local_sub(length, &tail_page->write); @@ -2647,7 +2513,7 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, * If we are less than the minimum size, we don't need to * worry about it. */ - if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) { + if (tail > (bsize - RB_EVNT_MIN_SIZE)) { /* No room for any events */ /* Mark the rest of the page with padding */ @@ -2662,19 +2528,19 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, } /* Put in a discarded event */ - event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE; + event->array[0] = (bsize - tail) - RB_EVNT_HDR_SIZE; event->type_len = RINGBUF_TYPE_PADDING; /* time delta must be non zero */ event->time_delta = 1; /* account for padding bytes */ - local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); + local_add(bsize - tail, &cpu_buffer->entries_bytes); /* Make sure the padding is visible before the tail_page->write update */ smp_wmb(); /* Set write to end of buffer */ - length = (tail + length) - BUF_PAGE_SIZE; + length = (tail + length) - bsize; local_sub(length, &tail_page->write); } @@ -2788,7 +2654,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, /* Slow path */ static struct ring_buffer_event * -rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs) +rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, u64 delta, bool abs) { if (abs) event->type_len = RINGBUF_TYPE_TIME_STAMP; @@ -2796,7 +2663,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs) event->type_len = RINGBUF_TYPE_TIME_EXTEND; /* Not the first event on the page, or not delta? */ - if (abs || rb_event_index(event)) { + if (abs || rb_event_index(cpu_buffer, event)) { event->time_delta = delta & TS_MASK; event->array[0] = delta >> TS_SHIFT; } else { @@ -2826,7 +2693,7 @@ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, (unsigned long long)info->ts, (unsigned long long)info->before, (unsigned long long)info->after, - (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0), + (unsigned long long)({rb_time_read(&cpu_buffer->write_stamp, &write_stamp); write_stamp;}), sched_clock_stable() ? "" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" @@ -2870,7 +2737,7 @@ static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer, if (!abs) info->delta = 0; } - *event = rb_add_time_stamp(*event, info->delta, abs); + *event = rb_add_time_stamp(cpu_buffer, *event, info->delta, abs); *length -= RB_LEN_TIME_EXTEND; *delta = 0; } @@ -2954,10 +2821,10 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage; unsigned long addr; - new_index = rb_event_index(event); + new_index = rb_event_index(cpu_buffer, event); old_index = new_index + rb_event_ts_length(event); addr = (unsigned long)event; - addr &= PAGE_MASK; + addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1); bpage = READ_ONCE(cpu_buffer->tail_page); @@ -3344,6 +3211,76 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); #define CHECK_FULL_PAGE 1L #ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS + +static const char *show_irq_str(int bits) +{ + const char *type[] = { + ".", // 0 + "s", // 1 + "h", // 2 + "Hs", // 3 + "n", // 4 + "Ns", // 5 + "Nh", // 6 + "NHs", // 7 + }; + + return type[bits]; +} + +/* Assume this is an trace event */ +static const char *show_flags(struct ring_buffer_event *event) +{ + struct trace_entry *entry; + int bits = 0; + + if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry)) + return "X"; + + entry = ring_buffer_event_data(event); + + if (entry->flags & TRACE_FLAG_SOFTIRQ) + bits |= 1; + + if (entry->flags & TRACE_FLAG_HARDIRQ) + bits |= 2; + + if (entry->flags & TRACE_FLAG_NMI) + bits |= 4; + + return show_irq_str(bits); +} + +static const char *show_irq(struct ring_buffer_event *event) +{ + struct trace_entry *entry; + + if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry)) + return ""; + + entry = ring_buffer_event_data(event); + if (entry->flags & TRACE_FLAG_IRQS_OFF) + return "d"; + return ""; +} + +static const char *show_interrupt_level(void) +{ + unsigned long pc = preempt_count(); + unsigned char level = 0; + + if (pc & SOFTIRQ_OFFSET) + level |= 1; + + if (pc & HARDIRQ_MASK) + level |= 2; + + if (pc & NMI_MASK) + level |= 4; + + return show_irq_str(level); +} + static void dump_buffer_page(struct buffer_data_page *bpage, struct rb_event_info *info, unsigned long tail) @@ -3364,34 +3301,57 @@ static void dump_buffer_page(struct buffer_data_page *bpage, case RINGBUF_TYPE_TIME_EXTEND: delta = rb_event_time_stamp(event); ts += delta; - pr_warn(" [%lld] delta:%lld TIME EXTEND\n", ts, delta); + pr_warn(" 0x%x: [%lld] delta:%lld TIME EXTEND\n", + e, ts, delta); break; case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); ts = rb_fix_abs_ts(delta, ts); - pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta); + pr_warn(" 0x%x: [%lld] absolute:%lld TIME STAMP\n", + e, ts, delta); break; case RINGBUF_TYPE_PADDING: ts += event->time_delta; - pr_warn(" [%lld] delta:%d PADDING\n", ts, event->time_delta); + pr_warn(" 0x%x: [%lld] delta:%d PADDING\n", + e, ts, event->time_delta); break; case RINGBUF_TYPE_DATA: ts += event->time_delta; - pr_warn(" [%lld] delta:%d\n", ts, event->time_delta); + pr_warn(" 0x%x: [%lld] delta:%d %s%s\n", + e, ts, event->time_delta, + show_flags(event), show_irq(event)); break; default: break; } } + pr_warn("expected end:0x%lx last event actually ended at:0x%x\n", tail, e); } static DEFINE_PER_CPU(atomic_t, checking); static atomic_t ts_dump; +#define buffer_warn_return(fmt, ...) \ + do { \ + /* If another report is happening, ignore this one */ \ + if (atomic_inc_return(&ts_dump) != 1) { \ + atomic_dec(&ts_dump); \ + goto out; \ + } \ + atomic_inc(&cpu_buffer->record_disabled); \ + pr_warn(fmt, ##__VA_ARGS__); \ + dump_buffer_page(bpage, info, tail); \ + atomic_dec(&ts_dump); \ + /* There's some cases in boot up that this can happen */ \ + if (WARN_ON_ONCE(system_state != SYSTEM_BOOTING)) \ + /* Do not re-enable checking */ \ + return; \ + } while (0) + /* * Check if the current event time stamp matches the deltas on * the buffer page. @@ -3445,7 +3405,12 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); - ts = rb_fix_abs_ts(delta, ts); + delta = rb_fix_abs_ts(delta, ts); + if (delta < ts) { + buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n", + cpu_buffer->cpu, ts, delta); + } + ts = delta; break; case RINGBUF_TYPE_PADDING: @@ -3462,23 +3427,11 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, } if ((full && ts > info->ts) || (!full && ts + info->delta != info->ts)) { - /* If another report is happening, ignore this one */ - if (atomic_inc_return(&ts_dump) != 1) { - atomic_dec(&ts_dump); - goto out; - } - atomic_inc(&cpu_buffer->record_disabled); - /* There's some cases in boot up that this can happen */ - WARN_ON_ONCE(system_state != SYSTEM_BOOTING); - pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s\n", - cpu_buffer->cpu, - ts + info->delta, info->ts, info->delta, - info->before, info->after, - full ? " (full)" : ""); - dump_buffer_page(bpage, info, tail); - atomic_dec(&ts_dump); - /* Do not re-enable checking */ - return; + buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n", + cpu_buffer->cpu, + ts + info->delta, info->ts, info->delta, + info->before, info->after, + full ? " (full)" : "", show_interrupt_level()); } out: atomic_dec(this_cpu_ptr(&checking)); @@ -3498,16 +3451,14 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event; struct buffer_page *tail_page; unsigned long tail, write, w; - bool a_ok; - bool b_ok; /* Don't let the compiler play games with cpu_buffer->tail_page */ tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page); /*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK; barrier(); - b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); - a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); + rb_time_read(&cpu_buffer->before_stamp, &info->before); + rb_time_read(&cpu_buffer->write_stamp, &info->after); barrier(); info->ts = rb_time_stamp(cpu_buffer->buffer); @@ -3522,7 +3473,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (!w) { /* Use the sub-buffer timestamp */ info->delta = 0; - } else if (unlikely(!a_ok || !b_ok || info->before != info->after)) { + } else if (unlikely(info->before != info->after)) { info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND; info->length += RB_LEN_TIME_EXTEND; } else { @@ -3544,7 +3495,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, tail = write - info->length; /* See if we shot pass the end of this buffer page */ - if (unlikely(write > BUF_PAGE_SIZE)) { + if (unlikely(write > cpu_buffer->buffer->subbuf_size)) { check_buffer(cpu_buffer, info, CHECK_FULL_PAGE); return rb_move_tail(cpu_buffer, tail, info); } @@ -3571,8 +3522,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* SLOW PATH - Interrupted between A and C */ /* Save the old before_stamp */ - a_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); - RB_WARN_ON(cpu_buffer, !a_ok); + rb_time_read(&cpu_buffer->before_stamp, &info->before); /* * Read a new timestamp and update the before_stamp to make @@ -3584,9 +3534,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, rb_time_set(&cpu_buffer->before_stamp, ts); barrier(); - /*E*/ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); - /* Was interrupted before here, write_stamp must be valid */ - RB_WARN_ON(cpu_buffer, !a_ok); + /*E*/ rb_time_read(&cpu_buffer->write_stamp, &info->after); barrier(); /*F*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && info->after == info->before && info->after < ts) { @@ -3678,7 +3626,7 @@ rb_reserve_next_event(struct trace_buffer *buffer, if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) { add_ts_default = RB_ADD_STAMP_ABSOLUTE; info.length += RB_LEN_TIME_EXTEND; - if (info.length > BUF_MAX_DATA_SIZE) + if (info.length > cpu_buffer->buffer->max_data_size) goto out_fail; } else { add_ts_default = RB_ADD_STAMP_NONE; @@ -3753,7 +3701,7 @@ ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length) if (unlikely(atomic_read(&cpu_buffer->record_disabled))) goto out; - if (unlikely(length > BUF_MAX_DATA_SIZE)) + if (unlikely(length > buffer->max_data_size)) goto out; if (unlikely(trace_recursive_lock(cpu_buffer))) @@ -3787,7 +3735,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage = cpu_buffer->commit_page; struct buffer_page *start; - addr &= PAGE_MASK; + addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1); /* Do the likely case first */ if (likely(bpage->page == (void *)addr)) { @@ -3903,7 +3851,7 @@ int ring_buffer_write(struct trace_buffer *buffer, if (atomic_read(&cpu_buffer->record_disabled)) |