author     Linus Torvalds <torvalds@linux-foundation.org>  2020-08-07 18:29:15 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2020-08-07 18:29:15 -0700
commit     32663c78c10f80df90b832de0428a6cb98a64e9a
tree       f7b4518ee7c8d84d91320461763181fc35698e24
parent     7b9de97711225559af213dc52b6ea883ef1ea7a8
parent     38ce2a9e33db61a3041840310077072d6210ead4
Merge tag 'trace-v5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace
Pull tracing updates from Steven Rostedt:
- The biggest news is that the tracing ring buffer can now time events
that interrupted other ring buffer events.
Before this change, if an interrupt came in while recording another
event, and that interrupt also had an event, those events would all
have the same time stamp as the event they interrupted.
Now, with the new design, each of those events gets a unique time
stamp that correctly shows when it was recorded, even though it was
recorded while interrupting another event.
- Bootconfig now has an "override" operator that lets users keep a
default config, but then add options that override the default (see
the short example after this list).
- A fix was made to properly filter function graph tracing to the
ftrace PIDs. This came in at the end of the -rc cycle, and needs to
be backported.
- Several clean ups, performance updates, and minor fixes as well.
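
For illustration, here is a minimal sketch of the new override operator in
bootconfig syntax. It mirrors the example this series adds to
Documentation/admin-guide/bootconfig.rst; the keys are only the
documentation's placeholder names, not real boot options:

  foo = bar, baz
  foo = qux       # !ERROR! a plain '=' cannot re-define an existing key
  foo := qux      # OK: ':=' overrides, and "foo" now holds "qux"
  foo.bar := qux  # !ERROR! a subkey still cannot co-exist with a parent value

This lets a (partial) custom bootconfig replace values from a default
bootconfig without re-parsing or editing the default.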
* tag 'trace-v5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace: (39 commits)
tracing: Add trace_array_init_printk() to initialize instance trace_printk() buffers
kprobes: Fix compiler warning for !CONFIG_KPROBES_ON_FTRACE
tracing: Use trace_sched_process_free() instead of exit() for pid tracing
bootconfig: Fix to find the initargs correctly
Documentation: bootconfig: Add bootconfig override operator
tools/bootconfig: Add testcases for value override operator
lib/bootconfig: Add override operator support
kprobes: Remove show_registers() function prototype
tracing/uprobe: Remove dead code in trace_uprobe_register()
kprobes: Fix NULL pointer dereference at kprobe_ftrace_handler
ftrace: Fix ftrace_trace_task return value
tracepoint: Use __used attribute definitions from compiler_attributes.h
tracepoint: Mark __tracepoint_string's __used
trace : Have tracing buffer info use kvzalloc instead of kzalloc
tracing: Remove outdated comment in stack handling
ftrace: Do not let direct or IPMODIFY ftrace_ops be added to module and set trampolines
ftrace: Setup correct FTRACE_FL_REGS flags for module
tracing/hwlat: Honor the tracing_cpumask
tracing/hwlat: Drop the duplicate assignment in start_kthread()
tracing: Save one trace_event->type by using __TRACE_LAST_TYPE
...
-rw-r--r--   Documentation/admin-guide/bootconfig.rst       |  11
-rw-r--r--   arch/x86/kernel/ftrace.c                       |  14
-rw-r--r--   arch/x86/kernel/ftrace_64.S                    |  29
-rw-r--r--   include/linux/kprobes.h                        |   1
-rw-r--r--   include/linux/ring_buffer.h                    |   1
-rw-r--r--   include/linux/trace.h                          |   1
-rw-r--r--   include/linux/tracepoint.h                     |  11
-rw-r--r--   include/trace/trace_events.h                   |  19
-rw-r--r--   init/main.c                                    |  14
-rw-r--r--   kernel/kprobes.c                               |  24
-rw-r--r--   kernel/trace/ftrace.c                          |  34
-rw-r--r--   kernel/trace/ring_buffer.c                     | 694
-rw-r--r--   kernel/trace/trace.c                           |  80
-rw-r--r--   kernel/trace/trace.h                           |   9
-rw-r--r--   kernel/trace/trace_events.c                    |   4
-rw-r--r--   kernel/trace/trace_hwlat.c                     |   6
-rw-r--r--   kernel/trace/trace_output.c                    |  14
-rw-r--r--   kernel/trace/trace_uprobe.c                    |   1
-rw-r--r--   lib/bootconfig.c                               |  33
-rw-r--r--   tools/bootconfig/samples/bad-override.bconf    |   3
-rw-r--r--   tools/bootconfig/samples/bad-override2.bconf   |   3
-rw-r--r--   tools/bootconfig/samples/good-override.bconf   |   6
-rwxr-xr-x   tools/bootconfig/test-bootconfig.sh            |  13

23 files changed, 775 insertions(+), 250 deletions(-)
diff --git a/Documentation/admin-guide/bootconfig.rst b/Documentation/admin-guide/bootconfig.rst
index d6b3b77a4129..a22024f9175e 100644
--- a/Documentation/admin-guide/bootconfig.rst
+++ b/Documentation/admin-guide/bootconfig.rst
@@ -71,6 +71,16 @@ For example,::
 
   foo = bar, baz
   foo = qux  # !ERROR! we can not re-define same key
 
+If you want to update the value, you must use the override operator
+``:=`` explicitly. For example::
+
+  foo = bar, baz
+  foo := qux
+
+then, the ``qux`` is assigned to ``foo`` key. This is useful for
+overriding the default value by adding (partial) custom bootconfigs
+without parsing the default bootconfig.
+
 If you want to append the value to existing key as an array member,
 you can use ``+=`` operator. For example::
 
@@ -84,6 +94,7 @@ For example, following config is NOT allowed.::
 
   foo = value1
   foo.bar = value2 # !ERROR! subkey "bar" and value "value1" can NOT co-exist
+  foo.bar := value2 # !ERROR! even with the override operator, this is NOT allowed.
 
 
 Comments
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 51504566b3a6..7edbd5ee5ed4 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -286,6 +286,7 @@ extern void ftrace_regs_caller_ret(void);
 extern void ftrace_caller_end(void);
 extern void ftrace_caller_op_ptr(void);
 extern void ftrace_regs_caller_op_ptr(void);
+extern void ftrace_regs_caller_jmp(void);
 
 /* movq function_trace_op(%rip), %rdx */
 /* 0x48 0x8b 0x15 <offset-to-ftrace_trace_op (4 bytes)> */
@@ -316,6 +317,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
 	unsigned long end_offset;
 	unsigned long op_offset;
 	unsigned long call_offset;
+	unsigned long jmp_offset;
 	unsigned long offset;
 	unsigned long npages;
 	unsigned long size;
@@ -333,11 +335,13 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
 		end_offset = (unsigned long)ftrace_regs_caller_end;
 		op_offset = (unsigned long)ftrace_regs_caller_op_ptr;
 		call_offset = (unsigned long)ftrace_regs_call;
+		jmp_offset = (unsigned long)ftrace_regs_caller_jmp;
 	} else {
 		start_offset = (unsigned long)ftrace_caller;
 		end_offset = (unsigned long)ftrace_caller_end;
 		op_offset = (unsigned long)ftrace_caller_op_ptr;
 		call_offset = (unsigned long)ftrace_call;
+		jmp_offset = 0;
 	}
 
 	size = end_offset - start_offset;
@@ -367,10 +371,14 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
 	if (WARN_ON(ret < 0))
 		goto fail;
 
+	/* No need to test direct calls on created trampolines */
 	if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
-		ip = trampoline + (ftrace_regs_caller_ret - ftrace_regs_caller);
-		ret = copy_from_kernel_nofault(ip, (void *)retq, RET_SIZE);
-		if (WARN_ON(ret < 0))
+		/* NOP the jnz 1f; but make sure it's a 2 byte jnz */
+		ip = trampoline + (jmp_offset - start_offset);
+		if (WARN_ON(*(char *)ip != 0x75))
+			goto fail;
+		ret = copy_from_kernel_nofault(ip, ideal_nops[2], 2);
+		if (ret < 0)
 			goto fail;
 	}
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 083a3da7bb73..ac3d5f22fe64 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -241,22 +241,10 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
 	 */
 	movq ORIG_RAX(%rsp), %rax
 	testq	%rax, %rax
-	jz	1f
+SYM_INNER_LABEL(ftrace_regs_caller_jmp, SYM_L_GLOBAL)
+	jnz	1f
 
-	/* Swap the flags with orig_rax */
-	movq MCOUNT_REG_SIZE(%rsp), %rdi
-	movq %rdi, MCOUNT_REG_SIZE-8(%rsp)
-	movq %rax, MCOUNT_REG_SIZE(%rsp)
-
-	restore_mcount_regs 8
-	/* Restore flags */
-	popfq
-
-SYM_INNER_LABEL(ftrace_regs_caller_ret, SYM_L_GLOBAL);
-	UNWIND_HINT_RET_OFFSET
-	jmp	ftrace_epilogue
-
-1:	restore_mcount_regs
+	restore_mcount_regs
 	/* Restore flags */
 	popfq
@@ -269,6 +257,17 @@ SYM_INNER_LABEL(ftrace_regs_caller_ret, SYM_L_GLOBAL);
 SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL)
 
 	jmp ftrace_epilogue
 
+	/* Swap the flags with orig_rax */
+1:	movq MCOUNT_REG_SIZE(%rsp), %rdi
+	movq %rdi, MCOUNT_REG_SIZE-8(%rsp)
+	movq %rax, MCOUNT_REG_SIZE(%rsp)
+
+	restore_mcount_regs 8
+	/* Restore flags */
+	popfq
+	UNWIND_HINT_RET_OFFSET
+	jmp	ftrace_epilogue
+
 SYM_FUNC_END(ftrace_regs_caller)
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 45b8cdc9fad7..9be1bff4f586 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -227,7 +227,6 @@ extern int arch_prepare_kprobe(struct kprobe *p);
 extern void arch_arm_kprobe(struct kprobe *p);
 extern void arch_disarm_kprobe(struct kprobe *p);
 extern int arch_init_kprobes(void);
-extern void show_registers(struct pt_regs *regs);
 extern void kprobes_inc_nmissed_count(struct kprobe *p);
 extern bool arch_within_kprobe_blacklist(unsigned long addr);
 extern int arch_populate_kprobe_blacklist(void);
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index c76b2f3b3ac4..136ea0997e6d 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -143,6 +143,7 @@ bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter);
 unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu);
 
 void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu);
+void ring_buffer_reset_online_cpus(struct trace_buffer *buffer);
 void ring_buffer_reset(struct trace_buffer *buffer);
 
 #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
diff --git a/include/linux/trace.h b/include/linux/trace.h
index 7fd86d3c691f..36d255d66f88 100644
--- a/include/linux/trace.h
+++ b/include/linux/trace.h
@@ -29,6 +29,7 @@ struct trace_array;
 void trace_printk_init_buffers(void);
 int trace_array_printk(struct trace_array *tr, unsigned long ip,
 		const char *fmt, ...);
+int trace_array_init_printk(struct trace_array *tr);
 void trace_array_put(struct trace_array *tr);
 struct trace_array *trace_array_get_by_name(const char *name);
 int trace_array_destroy(struct trace_array *tr);
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index a1fecf311621..598fec9f9dbf 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -116,8 +116,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 
 #define __TRACEPOINT_ENTRY(name)					\
 	static tracepoint_ptr_t __tracepoint_ptr_##name __used		\
-	__attribute__((section("__tracepoints_ptrs"))) =		\
-		&__tracepoint_##name
+	__section(__tracepoints_ptrs) = &__tracepoint_##name
 #endif
 
 #endif /* _LINUX_TRACEPOINT_H */
@@ -280,9 +279,9 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
  */
 #define DEFINE_TRACE_FN(name, reg, unreg)				\
 	static const char __tpstrtab_##name[]				\
-	__attribute__((section("__tracepoints_strings"))) = #name;	\
-	struct tracepoint __tracepoint_##name				\
-	__attribute__((section("__tracepoints"), used)) =		\
+	__section(__tracepoints_strings) = #name;			\
+	struct tracepoint __tracepoint_##name __used			\
+	__section(__tracepoints) =					\
 		{ __tpstrtab_##name, STATIC_KEY_INIT_FALSE, reg, unreg, NULL };\
 	__TRACEPOINT_ENTRY(name);
 
@@ -361,7 +360,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 		static const char *___tp_str __tracepoint_string = str; \
 		___tp_str;						\
 	})
-#define __tracepoint_string	__attribute__((section("__tracepoint_str")))
+#define __tracepoint_string	__used __section(__tracepoint_str)
 #else
 /*
  * tracepoint_string() is used to save the string address for userspace
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index 502c7be50b8d..1bc3e7bba9a4 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -210,8 +210,7 @@ TRACE_MAKE_SYSTEM_STR();
 #define DEFINE_EVENT(template, name, proto, args)
 
 #undef DEFINE_EVENT_PRINT
-#define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
-	DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print)
 
 #undef TRACE_EVENT_FLAGS
 #define TRACE_EVENT_FLAGS(event, flag)
@@ -443,12 +442,8 @@ static struct trace_event_fields trace_event_fields_##call[] = {	\
 	tstruct								\
 	{} };
 
-#undef DEFINE_EVENT
-#define DEFINE_EVENT(template, name, proto, args)
-
 #undef DEFINE_EVENT_PRINT
-#define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
-	DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print)
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
@@ -523,13 +518,6 @@ static inline notrace int trace_event_get_offsets_##call(		\
 	return __data_size;						\
 }
 
-#undef DEFINE_EVENT
-#define DEFINE_EVENT(template, name, proto, args)
-
-#undef DEFINE_EVENT_PRINT
-#define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
-	DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
-
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 /*
@@ -721,9 +709,6 @@ static inline void ftrace_test_probe_##call(void)			\
 	check_trace_callback_type_##call(trace_event_raw_event_##template); \
 }
 
-#undef DEFINE_EVENT_PRINT
-#define DEFINE_EVENT_PRINT(template, name, proto, args, print)
-
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 #undef __entry
diff --git a/init/main.c b/init/main.c
index de2f9fa6bb4a..ae78fb68d231 100644
--- a/init/main.c
+++ b/init/main.c
@@ -388,8 +388,6 @@ static int __init bootconfig_params(char *param, char *val,
 {
 	if (strcmp(param, "bootconfig") == 0) {
 		bootconfig_found = true;
-	} else if (strcmp(param, "--") == 0) {
-		initargs_found = true;
 	}
 	return 0;
 }
@@ -400,19 +398,23 @@ static void __init setup_boot_config(const char *cmdline)
 	const char *msg;
 	int pos;
 	u32 size, csum;
-	char *data, *copy;
+	char *data, *copy, *err;
 	int ret;
 
 	/* Cut out the bootconfig data even if we have no bootconfig option */
 	data = get_boot_config_from_initrd(&size, &csum);
 
 	strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE);
-	parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL,
-		   bootconfig_params);
+	err = parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL,
+			 bootconfig_params);
 
-	if (!bootconfig_found)
+	if (IS_ERR(err) || !bootconfig_found)
 		return;
 
+	/* parse_args() stops at '--' and returns an address */
+	if (err)
+		initargs_found = true;
+
 	if (!data) {
 		pr_err("'bootconfig' found on command line, but no bootconfig found\n");
 		return;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e87679a48ba2..287b263c9cb9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1111,9 +1111,20 @@ static int disarm_kprobe_ftrace(struct kprobe *p)
 		ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
 }
 #else	/* !CONFIG_KPROBES_ON_FTRACE */
-#define prepare_kprobe(p)	arch_prepare_kprobe(p)
-#define arm_kprobe_ftrace(p)	(-ENODEV)
-#define disarm_kprobe_ftrace(p)	(-ENODEV)
+static inline int prepare_kprobe(struct kprobe *p)
+{
+	return arch_prepare_kprobe(p);
+}
+
+static inline int arm_kprobe_ftrace(struct kprobe *p)
+{
+	return -ENODEV;
+}
+
+static inline int disarm_kprobe_ftrace(struct kprobe *p)
+{
+	return -ENODEV;
+}
 #endif
 
 /* Arm a kprobe with text_mutex */
@@ -2145,6 +2156,13 @@ static void kill_kprobe(struct kprobe *p)
 	 * the original probed function (which will be freed soon) any more.
 	 */
 	arch_remove_kprobe(p);
+
+	/*
+	 * The module is going away. We should disarm the kprobe which
+	 * is using ftrace.
+	 */
+	if (kprobe_ftrace(p))
+		disarm_kprobe_ftrace(p);
 }
 
 /* Disable one kprobe */
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 72064541bef2..275441254bb5 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -139,9 +139,6 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops)
 #endif
 }
 
-#define FTRACE_PID_IGNORE	-1
-#define FTRACE_PID_TRACE	-2
-
 static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
 			    struct ftrace_ops *op, struct pt_regs *regs)
 {
@@ -2388,6 +2385,14 @@ struct ftrace_ops direct_ops = {
 	.flags		= FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE
 			  | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS
 			  | FTRACE_OPS_FL_PERMANENT,
+	/*
+	 * By declaring the main trampoline as this trampoline
+	 * it will never have one allocated for it. Allocated
+	 * trampolines should not call direct functions.
+	 * The direct_ops should only be called by the builtin
+	 * ftrace_regs_caller trampoline.
+	 */
+	.trampoline	= FTRACE_REGS_ADDR,
 };
 #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
 
@@ -6255,8 +6260,19 @@ static int referenced_filters(struct dyn_ftrace *rec)
 	int cnt = 0;
 
 	for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
-		if (ops_references_rec(ops, rec))
-			cnt++;
+		if (ops_references_rec(ops, rec)) {
+			if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_DIRECT))
+				continue;
+			if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+				continue;
+			cnt++;
+			if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
+				rec->flags |= FTRACE_FL_REGS;
+			if (cnt == 1 && ops->trampoline)
+				rec->flags |= FTRACE_FL_TRAMP;
+			else
+				rec->flags &= ~FTRACE_FL_TRAMP;
+		}
 	}
 
 	return cnt;
@@ -6435,8 +6451,8 @@ void ftrace_module_enable(struct module *mod)
 		if (ftrace_start_up)
 			cnt += referenced_filters(rec);
 
-		/* This clears FTRACE_FL_DISABLED */
-		rec->flags = cnt;
+		rec->flags &= ~FTRACE_FL_DISABLED;
+		rec->flags += cnt;
 
 		if (ftrace_start_up && cnt) {
 			int failed = __ftrace_replace_code(rec, 1);
@@ -7066,12 +7082,12 @@ void ftrace_pid_follow_fork(struct trace_array *tr, bool enable)
 	if (enable) {
 		register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
 						  tr);
-		register_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+		register_trace_sched_process_free(ftrace_pid_follow_sched_process_exit,
 						  tr);
 	} else {
 		unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
 						    tr);
-		unregister_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+		unregister_trace_sched_process_free(ftrace_pid_follow_sched_process_exit,
 						    tr);
 	}
 }
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f15471ce969e..93ef0ab6ea20 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -270,6 +270,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
 #define for_each_buffer_cpu(buffer, cpu)		\
 	for_each_cpu(cpu, buffer->cpumask)
 
+#define for_each_online_buffer_cpu(buffer, cpu)		\
+	for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask)
+
 #define TS_SHIFT	27
 #define TS_MASK		((1ULL << TS_SHIFT) - 1)
 #define TS_DELTA_TEST	(~TS_MASK)
@@ -413,12 +416,27 @@ struct rb_irq_work {
 struct rb_event_info {
 	u64			ts;
 	u64			delta;
+	u64			before;
+	u64			after;
 	unsigned long		length;
 	struct buffer_page	*tail_page;
 	int			add_timestamp;
 };
 
 /*
+ * Used for the add_timestamp
+ *  NONE
+ *  EXTEND - wants a time extend
+ *  ABSOLUTE - the buffer requests all events to have absolute time stamps
+ *  FORCE - force a full time stamp.
+ */
+enum {
+	RB_ADD_STAMP_NONE	= 0,
+	RB_ADD_STAMP_EXTEND	= BIT(1),
+	RB_ADD_STAMP_ABSOLUTE	= BIT(2),
+	RB_ADD_STAMP_FORCE	= BIT(3)
+};
+/*
  * Used for which event context the event is in.
  *  NMI     = 0
  *  IRQ     = 1
@@ -435,6 +453,28 @@ enum {
 	RB_CTX_MAX
 };
 
+#if BITS_PER_LONG == 32
+#define RB_TIME_32
+#endif
+
+/* To test on 64 bit machines */
+//#define RB_TIME_32
+
+#ifdef RB_TIME_32
+
+struct rb_time_struct {
+	local_t		cnt;
+	local_t		top;
+	local_t		bottom;
+};
+#else
+#include <asm/local64.h>
+struct rb_time_struct {
+	local64_t	time;
+};
+#endif
+typedef struct rb_time_struct rb_time_t;
+
 /*
  * head_page == tail_page && head == tail then buffer is empty.
  */
@@ -470,7 +510,8 @@ struct ring_buffer_per_cpu {
 	size_t				shortest_full;
 	unsigned long			read;
 	unsigned long			read_bytes;
-	u64				write_stamp;
+	rb_time_t			write_stamp;
+	rb_time_t			before_stamp;
 	u64				read_stamp;
 	/* ring buffer pages to update, > 0 to add, < 0 to remove */
 	long				nr_pages_to_update;
@@ -513,6 +554,189 @@ struct ring_buffer_iter {
 	int				missed_events;
 };
 
+#ifdef RB_TIME_32
+
+/*
+ * On 32 bit machines, local64_t is very expensive. As the ring
+ * buffer doesn't need all the features of a true 64 bit atomic,
+ * on 32 bit, it uses these functions (64 still uses local64_t).
+ *
+ * For the ring buffer, 64 bit required operations for the time is
+ * the following:
+ *
+ *  - Only need 59 bits (uses 60 to make it even).
+ *  - Reads may fail if it interrupted a modification of the time stamp.
+ *      It will succeed if it did not interrupt another write even if
+ *      the read itself is interrupted by a write.
+ *      It returns whether it was successful or not.
+ *
+ *  - Writes always succeed and will overwrite other writes and writes
+ *      that were done by events interrupting the current write.
+ *
+ *  - A write followed by a read of the same time stamp will always succeed,
+ *      but may not contain the same value.
+ *
+ *  - A cmpxchg will fail if it interrupted another write or cmpxchg.
+ *      Other than that, it acts like a normal cmpxchg.
+ *
+ * The 60 bit time stamp is broken up by 30 bits in a top and bottom half
+ *  (bottom being the least significant 30 bits of the 60 bit time stamp).
+ *
+ * The two most significant bits of each half holds a 2 bit counter (0-3).
+ * Each update will increment this counter by one.
+ * When reading the top and bottom, if the two counter bits match then the
+ *  top and bottom together make a valid 60 bit number.
+ */
+#define RB_TIME_SHIFT	30
+#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1)
+
+static inline int rb_time_cnt(unsigned long val)
+{
+	return (val >> RB_TIME_SHIFT) & 3;
+}
+
+static inline u64 rb_time_val(unsigned long top, unsigned long bottom)
+{
+	u64 val;
+
+	val = top & RB_TIME_VAL_MASK;
+	val <<= RB_TIME_SHIFT;
+	val |= bottom & RB_TIME_VAL_MASK;
+
+	return val;
+}
+
+static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
+{
+	unsigned long top, bottom;
+	unsigned long c;
+
+	/*
+	 * If the read is interrupted by a write, then the cnt will
+	 * be different. Loop until both top and bottom have been read
+	 * without interruption.
+	 */
+	do {
+		c = local_read(&t->cnt);
+		top = local_read(&t->top);
+		bottom = local_read(&t->bottom);
+	} while (c != local_read(&t->cnt));
+
+	*cnt = rb_time_cnt(top);
+
+	/* If top and bottom counts don't match, this interrupted a write */
+	if (*cnt != rb_time_cnt(bottom))
+		return false;
+
+	*ret = rb_time_val(top, bottom);
+	return true;
+}
+
+static bool rb_time_read(rb_time_t *t, u64 *ret)
+{
+	unsigned long cnt;
+
+	return __rb_time_read(t, ret, &cnt);
+}
+
+static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt)
+{
+	return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT);
+}
+
+static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom)
+{
+	*top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK);
+	*bottom = (unsigned long)(val & RB_TIME_VAL_MASK);
+}
+
+static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt)
+{
+	val = rb_time_val_cnt(val, cnt);
+	local_set(t, val);
+}
+
+static void rb_time_set(rb_time_t *t, u64 val)
+{
+	unsigned long cnt, top, bottom;
+
+	rb_time_split(val, &top, &bottom);
+
+	/* Writes always succeed with a valid number even if it gets interrupted. */
+	do {
+		cnt = local_inc_return(&t->cnt);
+		rb_time_val_set(&t->top, top, cnt);
+		rb_time_val_set(&t->bottom, bottom, cnt);
+	} while (cnt != local_read(&t->cnt));
+}
+
+static inline bool
+rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
+{
+	unsigned long ret;
+
+	ret = local_cmpxchg(l, expect, set);
+	return ret == expect;
+}
+
+static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
+{
+	unsigned long cnt, top, bottom;
+	unsigned long cnt2, top2, bottom2;
+	u64 val;
+
+	/* The cmpxchg always fails if it interrupted an update */
+	if (!__rb_time_read(t, &val, &cnt2))
+		return false;
+
+	if (val != expect)
+		return false;
+
+	cnt = local_read(&t->cnt);
+	if ((cnt & 3) != cnt2)
+		return false;
+
+	cnt2 = cnt + 1;
+
+	rb_time_split(val, &top, &bottom);
+	top = rb_time_val_cnt(top, cnt);
+	bottom = rb_time_val_cnt(bottom, cnt);
+
+	rb_time_split(set, &top2, &bottom2);
+	top2 = rb_time_val_cnt(top2, cnt2);
+	bottom2 = rb_time_val_cnt(bottom2, cnt2);
+
+	if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2))
+		return false;
+	if (!rb_time_read_cmpxchg(&t->top, top, top2))
+		return false;
+	if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2))
+		return false;
+	return true;
+}
+
+#else /* 64 bits */
+
+/* local64_t always succeeds */
+
+static inline bool rb_time_read(rb_time_t *t, u64 *ret)
+{
+	*ret = local64_read(&t->time);
+	return true;
+}
+static void rb_time_set(rb_time_t *t, u64 val)
+{
+	local64_set(&t->time, val);
+}
+
+static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
+{
+	u64 val;
+	val = local64_cmpxchg(&t->time, expect, set);
+	return val == expect;
+}
+#endif
+
 /**
  * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer
  * @buffer: The ring_buffer to get the number of pages from
@@ -746,8 +970,16 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
 
 static inline u64 rb_time_stamp(struct trace_buffer *buffer)
 {
+	u64 ts;
+
+	/* Skip retpolines :-( */
+	if (IS_ENABLED(CONFIG_RETPOLINE) && likely(buffer->clock == trace_clock_local))
+		ts = trace_clock_local();
+	else
+		ts = buffer->clock();
+
 	/* shift to debug/test normalization and TIME_EXTENTS */
-	return buffer->clock() << DEBUG_SHIFT;
+	return ts << DEBUG_SHIFT;
 }
 
 u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu)
@@ -2372,8 +2604,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	return NULL;
 }
 
-/* Slow path, do not inline */
-static noinline struct ring_buffer_event *
+/* Slow path */
+static struct ring_buffer_event *
 rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
 {
 	if (abs)
@@ -2397,6 +2629,66 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
 static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
 				      struct ring_buffer_event *event);
 
+#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+static inline bool sched_clock_stable(void)
+{
+	return true;
+}
+#endif
+
+static void
+rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+		   struct rb_event_info *info)
+{
+	u64 write_stamp;
+
+	WARN_ONCE(1, "Delta way too big! %llu ts=%llu before=%llu after=%llu write stamp=%llu\n%s",
+		  (unsigned long long)info->delta,
+		  (unsigned long long)info->ts,
+		  (unsigned long long)info->before,
+		  (unsigned long long)info->after,
+		  (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0),
+		  sched_clock_stable() ? "" :
+		  "If you just came from a suspend/resume,\n"
+		  "please switch to the trace global clock:\n"
+		  "  echo global > /sys/kernel/debug/tracing/trace_clock\n"
+		  "or add trace_clock=global to the kernel command line\n");
+}
+
+static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+				      struct ring_buffer_event **event,
+				      struct rb_event_info *info,
+				      u64 *delta,
+				      unsigned int *length)
+{
+	bool abs = info->add_timestamp &
+		(RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE);
+
+	if (unlikely(info->delta > (1ULL << 59))) {
+		/* did the clock go backwards */
+		if (info->before == info->after && info->before > info->ts) {
+			/* not interrupted */
+			static int once;
+
+			/*
+			 * This is possible with a recalibrating of the TSC.
+			 * Do not produce a call stack, but just report it.
+			 */
+			if (!once) {
+				once++;
+				pr_warn("Ring buffer clock went backwards: %llu -> %llu\n",
+					info->before, info->ts);
+			}
+		} else
+			rb_check_timestamp(cpu_buffer, info);
+		if (!abs)
+			info->delta = 0;
+	}
+	*event = rb_add_time_stamp(*event, info->delta, abs);
+	*length -= RB_LEN_TIME_EXTEND;
+	*delta = 0;
+}
+
 /**
  * rb_update_event - update event type and data
  * @cpu_buffer: The per cpu buffer of the @event
@@ -2416,21 +2708,12 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
 	unsigned length = info->length;
 	u64 delta = info->delta;
 
-	/* Only a commit updates the timestamp */
-	if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
-		delta = 0;
-
 	/*
 	 * If we need to add a timestamp, then we
 	 * add it to the start of the reserved space.
 	 */
-	if (unlikely(info->add_timestamp)) {
-		bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
-
-		event = rb_add_time_stamp(event, abs ? info->delta : delta, abs);
-		length -= RB_LEN_TIME_EXTEND;
-		delta = 0;