From dd17c8f72993f9461e9c19250e3f155d6d99df22 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 29 Oct 2009 22:34:15 +0900 Subject: percpu: remove per_cpu__ prefix. Now that the return from alloc_percpu is compatible with the address of per-cpu vars, it makes sense to hand around the address of per-cpu variables. To make this sane, we remove the per_cpu__ prefix we used created to stop people accidentally using these vars directly. Now we have sparse, we can use that (next patch). tj: * Updated to convert stuff which were missed by or added after the original patch. * Kill per_cpu_var() macro. Signed-off-by: Rusty Russell Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter --- kernel/rcutorture.c | 8 ++++---- kernel/trace/trace.c | 6 +++--- kernel/trace/trace_functions_graph.c | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 178967b6434e..e339ab349121 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -731,13 +731,13 @@ static void rcu_torture_timer(unsigned long unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); + __this_cpu_inc(rcu_torture_count[pipe_count]); completed = cur_ops->completed() - completed; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; } - __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); + __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); } @@ -786,13 +786,13 @@ rcu_torture_reader(void *arg) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); + __this_cpu_inc(rcu_torture_count[pipe_count]); completed = cur_ops->completed() - completed; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; } - __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); + __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); schedule(); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 85a5ed70b5b2..b808177af816 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -91,12 +91,12 @@ DEFINE_PER_CPU(int, ftrace_cpu_disabled); static inline void ftrace_disable_cpu(void) { preempt_disable(); - __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled)); + __this_cpu_inc(ftrace_cpu_disabled); } static inline void ftrace_enable_cpu(void) { - __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled)); + __this_cpu_dec(ftrace_cpu_disabled); preempt_enable(); } @@ -1085,7 +1085,7 @@ trace_function(struct trace_array *tr, struct ftrace_entry *entry; /* If we are reading the ring buffer, don't trace */ - if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) + if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) return; event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 90a6daa10962..8614e3241ff8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -176,7 +176,7 @@ static int __trace_graph_entry(struct trace_array *tr, struct ring_buffer *buffer = tr->buffer; struct ftrace_graph_ent_entry *entry; - if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) + if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) return 0; event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, @@ -240,7 +240,7 @@ static void __trace_graph_return(struct trace_array *tr, struct ring_buffer *buffer = tr->buffer; struct ftrace_graph_ret_entry *entry; - if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) + if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) return; event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, -- cgit v1.2.3 From f409adf5b1db55ece7e80b67a944f9c0d3fe93e9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Dec 2009 14:02:00 +0100 Subject: futex: Protect pid lookup in compat code with RCU find_task_by_vpid() in compat_sys_get_robust_list() does not require tasklist_lock. It can be protected with rcu_read_lock as done in sys_get_robust_list() already. Signed-off-by: Thomas Gleixner Cc: Darren Hart Cc: Peter Zijlstra --- kernel/futex_compat.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 235716556bf1..d49afb2395e5 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -146,7 +146,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, struct task_struct *p; ret = -ESRCH; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_task_by_vpid(pid); if (!p) goto err_unlock; @@ -157,7 +157,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, !capable(CAP_SYS_PTRACE)) goto err_unlock; head = p->compat_robust_list; - read_unlock(&tasklist_lock); + rcu_read_unlock(); } if (put_user(sizeof(*head), len_ptr)) @@ -165,7 +165,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, return put_user(ptr_to_compat(head), head_ptr); err_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return ret; } -- cgit v1.2.3 From 86fc80f16e8a2449d5827bf1a9838b7fd9f70097 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Dec 2009 17:13:31 +0100 Subject: capabilities: Use RCU to protect task lookup in sys_capget cap_get_target_pid() protects the task lookup with tasklist_lock. security_capget() is called under tasklist_lock as well but tasklist_lock does not protect anything there. The capabilities are protected by RCU already. So tasklist_lock only protects the lookup and prevents the task going away, which can be done with rcu_read_lock() as well. Signed-off-by: Thomas Gleixner Signed-off-by: James Morris --- kernel/capability.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 7f876e60521f..9e4697e9b276 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -135,7 +135,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, if (pid && (pid != task_pid_vnr(current))) { struct task_struct *target; - read_lock(&tasklist_lock); + rcu_read_lock(); target = find_task_by_vpid(pid); if (!target) @@ -143,7 +143,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, else ret = security_capget(target, pEp, pIp, pPp); - read_unlock(&tasklist_lock); + rcu_read_unlock(); } else ret = security_capget(current, pEp, pIp, pPp); -- cgit v1.2.3 From 49f474331e563a6ecf3b1e87ec27ec5482b3e4f1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 27 Dec 2009 11:51:52 +0100 Subject: perf events: Remove arg from perf sched hooks Since we only ever schedule the local cpu, there is no need to pass the cpu number to the perf sched hooks. This micro-optimizes things a bit. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 27 ++++++++++++++------------- kernel/sched.c | 6 +++--- 2 files changed, 17 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 03cc061398d1..099bd662daa6 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1170,9 +1170,9 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, * not restart the event. */ void perf_event_task_sched_out(struct task_struct *task, - struct task_struct *next, int cpu) + struct task_struct *next) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event_context *ctx = task->perf_event_ctxp; struct perf_event_context *next_ctx; struct perf_event_context *parent; @@ -1252,8 +1252,9 @@ static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) static void __perf_event_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, int cpu) + struct perf_cpu_context *cpuctx) { + int cpu = smp_processor_id(); struct perf_event *event; int can_add_hw = 1; @@ -1326,24 +1327,24 @@ __perf_event_sched_in(struct perf_event_context *ctx, * accessing the event control register. If a NMI hits, then it will * keep the event running. */ -void perf_event_task_sched_in(struct task_struct *task, int cpu) +void perf_event_task_sched_in(struct task_struct *task) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event_context *ctx = task->perf_event_ctxp; if (likely(!ctx)) return; if (cpuctx->task_ctx == ctx) return; - __perf_event_sched_in(ctx, cpuctx, cpu); + __perf_event_sched_in(ctx, cpuctx); cpuctx->task_ctx = ctx; } -static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) +static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx) { struct perf_event_context *ctx = &cpuctx->ctx; - __perf_event_sched_in(ctx, cpuctx, cpu); + __perf_event_sched_in(ctx, cpuctx); } #define MAX_INTERRUPTS (~0ULL) @@ -1461,7 +1462,7 @@ static void rotate_ctx(struct perf_event_context *ctx) raw_spin_unlock(&ctx->lock); } -void perf_event_task_tick(struct task_struct *curr, int cpu) +void perf_event_task_tick(struct task_struct *curr) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; @@ -1469,7 +1470,7 @@ void perf_event_task_tick(struct task_struct *curr, int cpu) if (!atomic_read(&nr_events)) return; - cpuctx = &per_cpu(perf_cpu_context, cpu); + cpuctx = &__get_cpu_var(perf_cpu_context); ctx = curr->perf_event_ctxp; perf_ctx_adjust_freq(&cpuctx->ctx); @@ -1484,9 +1485,9 @@ void perf_event_task_tick(struct task_struct *curr, int cpu) if (ctx) rotate_ctx(ctx); - perf_event_cpu_sched_in(cpuctx, cpu); + perf_event_cpu_sched_in(cpuctx); if (ctx) - perf_event_task_sched_in(curr, cpu); + perf_event_task_sched_in(curr); } /* @@ -1527,7 +1528,7 @@ static void perf_event_enable_on_exec(struct task_struct *task) raw_spin_unlock(&ctx->lock); - perf_event_task_sched_in(task, smp_processor_id()); + perf_event_task_sched_in(task); out: local_irq_restore(flags); } diff --git a/kernel/sched.c b/kernel/sched.c index 18cceeecce35..d6527ac0f6e7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2752,7 +2752,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); - perf_event_task_sched_in(current, cpu_of(rq)); + perf_event_task_sched_in(current); finish_lock_switch(rq, prev); fire_sched_in_preempt_notifiers(current); @@ -5266,7 +5266,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); raw_spin_unlock(&rq->lock); - perf_event_task_tick(curr, cpu); + perf_event_task_tick(curr); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); @@ -5480,7 +5480,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); - perf_event_task_sched_out(prev, next, cpu); + perf_event_task_sched_out(prev, next); rq->nr_switches++; rq->curr = next; -- cgit v1.2.3 From 07b139c8c81b97bbe55c68daf0cbeca8b1c609ca Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 21 Dec 2009 14:27:35 +0800 Subject: perf events: Remove CONFIG_EVENT_PROFILE Quoted from Ingo: | This reminds me - i think we should eliminate CONFIG_EVENT_PROFILE - | it's an unnecessary Kconfig complication. If both PERF_EVENTS and | EVENT_TRACING is enabled we should expose generic tracepoints. | | Nor is it limited to event 'profiling', so it has become a misnomer as | well. Signed-off-by: Li Zefan Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <4B2F1557.2050705@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 4 ++-- kernel/trace/Makefile | 4 +++- kernel/trace/trace_events_filter.c | 4 ++-- kernel/trace/trace_kprobe.c | 14 +++++++------- kernel/trace/trace_syscalls.c | 5 ++--- 5 files changed, 16 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 099bd662daa6..5b987b4a98a8 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4177,7 +4177,7 @@ static const struct pmu perf_ops_task_clock = { .read = task_clock_perf_event_read, }; -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_EVENT_TRACING void perf_tp_event(int event_id, u64 addr, u64 count, void *record, int entry_size) @@ -4282,7 +4282,7 @@ static void perf_event_free_filter(struct perf_event *event) { } -#endif /* CONFIG_EVENT_PROFILE */ +#endif /* CONFIG_EVENT_TRACING */ #ifdef CONFIG_HAVE_HW_BREAKPOINT static void bp_perf_event_destroy(struct perf_event *event) diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index cd9ecd89ec77..d00c6fe23f54 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -51,7 +51,9 @@ endif obj-$(CONFIG_EVENT_TRACING) += trace_events.o obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o -obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o +ifeq ($(CONFIG_PERF_EVENTS),y) +obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o +endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 50504cb228de..74563d7e102e 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1360,7 +1360,7 @@ out_unlock: return err; } -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS void ftrace_profile_free_filter(struct perf_event *event) { @@ -1428,5 +1428,5 @@ out_unlock: return err; } -#endif /* CONFIG_EVENT_PROFILE */ +#endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 375f81a568dc..75d75dec226a 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1249,7 +1249,7 @@ static int kretprobe_event_show_format(struct ftrace_event_call *call, ", REC->" FIELD_STRING_RETIP); } -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ static __kprobes int kprobe_profile_func(struct kprobe *kp, @@ -1407,7 +1407,7 @@ static void probe_profile_disable(struct ftrace_event_call *call) disable_kprobe(&tp->rp.kp); } } -#endif /* CONFIG_EVENT_PROFILE */ +#endif /* CONFIG_PERF_EVENTS */ static __kprobes @@ -1417,10 +1417,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) if (tp->flags & TP_FLAG_TRACE) kprobe_trace_func(kp, regs); -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS if (tp->flags & TP_FLAG_PROFILE) kprobe_profile_func(kp, regs); -#endif /* CONFIG_EVENT_PROFILE */ +#endif return 0; /* We don't tweek kernel, so just return 0 */ } @@ -1431,10 +1431,10 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) if (tp->flags & TP_FLAG_TRACE) kretprobe_trace_func(ri, regs); -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS if (tp->flags & TP_FLAG_PROFILE) kretprobe_profile_func(ri, regs); -#endif /* CONFIG_EVENT_PROFILE */ +#endif return 0; /* We don't tweek kernel, so just return 0 */ } @@ -1463,7 +1463,7 @@ static int register_probe_event(struct trace_probe *tp) call->regfunc = probe_event_enable; call->unregfunc = probe_event_disable; -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS call->profile_enable = probe_profile_enable; call->profile_disable = probe_profile_disable; #endif diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 75289f372dd2..f694f66d75b0 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -421,7 +421,7 @@ int __init init_ftrace_syscalls(void) } core_initcall(init_ftrace_syscalls); -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); @@ -626,6 +626,5 @@ void prof_sysexit_disable(struct ftrace_event_call *call) mutex_unlock(&syscall_trace_lock); } -#endif - +#endif /* CONFIG_PERF_EVENTS */ -- cgit v1.2.3 From d894837f23f491aa7ed167aae767fc07cfe6e6e6 Mon Sep 17 00:00:00 2001 From: Simon Kagstrom Date: Wed, 23 Dec 2009 11:08:18 +0100 Subject: sched: might_sleep(): Make file parameter const char * Fixes a warning when building with g++: warning: deprecated conversion from string constant to 'char*' And the file parameter use is constant, so mark it as such. Signed-off-by: Simon Kagstrom Cc: peterz@infradead.org LKML-Reference: <20091223110818.442d848e@marrow.netinsight.se> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c535cc4f6428..64298a52eaa6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9694,7 +9694,7 @@ static inline int preempt_count_equals(int preempt_offset) return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); } -void __might_sleep(char *file, int line, int preempt_offset) +void __might_sleep(const char *file, int line, int preempt_offset) { #ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ -- cgit v1.2.3 From e1783a240f491fb233f04edc042e16b18a7a79ba Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 5 Jan 2010 15:34:50 +0900 Subject: module: Use this_cpu_xx to dynamically allocate counters Use cpu ops to deal with the per cpu data instead of a local_t. Reduces memory requirements, cache footprint and decreases cycle counts. The this_cpu_xx operations are also used for !SMP mode. Otherwise we could not drop the use of __module_ref_addr() which would make per cpu data handling complicated. this_cpu_xx operations have their own fallback for !SMP. V8-V9: - Leave include asm/module.h since ringbuffer.c depends on it. Nothing else does though. Another patch will deal with that. - Remove spurious free. Signed-off-by: Christoph Lameter Acked-by: Rusty Russell Signed-off-by: Tejun Heo --- kernel/module.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index e96b8ed1cb6a..9bf228052ec5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -474,9 +474,10 @@ static void module_unload_init(struct module *mod) INIT_LIST_HEAD(&mod->modules_which_use_me); for_each_possible_cpu(cpu) - local_set(__module_ref_addr(mod, cpu), 0); + per_cpu_ptr(mod->refptr, cpu)->count = 0; + /* Hold reference count during initialization. */ - local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1); + __this_cpu_write(mod->refptr->count, 1); /* Backwards compatibility macros put refcount during init. */ mod->waiter = current; } @@ -619,7 +620,7 @@ unsigned int module_refcount(struct module *mod) int cpu; for_each_possible_cpu(cpu) - total += local_read(__module_ref_addr(mod, cpu)); + total += per_cpu_ptr(mod->refptr, cpu)->count; return total; } EXPORT_SYMBOL(module_refcount); @@ -796,14 +797,15 @@ static struct module_attribute refcnt = { void module_put(struct module *module) { if (module) { - unsigned int cpu = get_cpu(); - local_dec(__module_ref_addr(module, cpu)); + preempt_disable(); + __this_cpu_dec(module->refptr->count); + trace_module_put(module, _RET_IP_, - local_read(__module_ref_addr(module, cpu))); + __this_cpu_read(module->refptr->count)); /* Maybe they're waiting for us to drop reference? */ if (unlikely(!module_is_live(module))) wake_up_process(module->waiter); - put_cpu(); + preempt_enable(); } } EXPORT_SYMBOL(module_put); @@ -1394,9 +1396,9 @@ static void free_module(struct module *mod) kfree(mod->args); if (mod->percpu) percpu_modfree(mod->percpu); -#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) +#if defined(CONFIG_MODULE_UNLOAD) if (mod->refptr) - percpu_modfree(mod->refptr); + free_percpu(mod->refptr); #endif /* Free lock-classes: */ lockdep_free_key_range(mod->module_core, mod->core_size); @@ -2159,9 +2161,8 @@ static noinline struct module *load_module(void __user *umod, mod = (void *)sechdrs[modindex].sh_addr; kmemleak_load_module(mod, hdr, sechdrs, secstrings); -#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) - mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), - mod->name); +#if defined(CONFIG_MODULE_UNLOAD) + mod->refptr = alloc_percpu(struct module_ref); if (!mod->refptr) { err = -ENOMEM; goto free_init; @@ -2393,8 +2394,8 @@ static noinline struct module *load_module(void __user *umod, kobject_put(&mod->mkobj.kobj); free_unload: module_unload_free(mod); -#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) - percpu_modfree(mod->refptr); +#if defined(CONFIG_MODULE_UNLOAD) + free_percpu(mod->refptr); free_init: #endif module_free(mod, mod->module_init); -- cgit v1.2.3 From 79615760f380ec86cd58204744e774c33fab9211 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 5 Jan 2010 15:34:50 +0900 Subject: local_t: Move local.h include to ringbuffer.c and ring_buffer_benchmark.c ringbuffer*.c are the last users of local.h. Remove the include from modules.h and add it to ringbuffer files. Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- kernel/trace/ring_buffer.c | 1 + kernel/trace/ring_buffer_benchmark.c | 1 + 2 files changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2326b04c95c4..eb6c8988c31a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -20,6 +20,7 @@ #include #include +#include #include "trace.h" /* diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index b2477caf09c2..df74c7982255 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -8,6 +8,7 @@ #include #include #include +#include struct rb_page { u64 ts; -- cgit v1.2.3 From 16295bec6398a3eedc9377e1af6ff4c71b98c300 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 6 Jan 2010 19:47:10 +1100 Subject: padata: Generic parallelization/serialization interface This patch introduces an interface to process data objects in parallel. The parallelized objects return after serialization in the same order as they were before the parallelization. Signed-off-by: Steffen Klassert Signed-off-by: Herbert Xu --- kernel/Makefile | 1 + kernel/padata.c | 690 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 691 insertions(+) create mode 100644 kernel/padata.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 864ff75d65f2..6aebdeb2aa34 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -100,6 +100,7 @@ obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o +obj-$(CONFIG_PADATA) += padata.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/padata.c b/kernel/padata.c new file mode 100644 index 000000000000..6f9bcb8313d6 --- /dev/null +++ b/kernel/padata.c @@ -0,0 +1,690 @@ +/* + * padata.c - generic interface to process data streams in parallel + * + * Copyright (C) 2008, 2009 secunet Security Networks AG + * Copyright (C) 2008, 2009 Steffen Klassert + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_SEQ_NR INT_MAX - NR_CPUS +#define MAX_OBJ_NUM 10000 * NR_CPUS + +static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) +{ + int cpu, target_cpu; + + target_cpu = cpumask_first(pd->cpumask); + for (cpu = 0; cpu < cpu_index; cpu++) + target_cpu = cpumask_next(target_cpu, pd->cpumask); + + return target_cpu; +} + +static int padata_cpu_hash(struct padata_priv *padata) +{ + int cpu_index; + struct parallel_data *pd; + + pd = padata->pd; + + /* + * Hash the sequence numbers to the cpus by taking + * seq_nr mod. number of cpus in use. + */ + cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask); + + return padata_index_to_cpu(pd, cpu_index); +} + +static void padata_parallel_worker(struct work_struct *work) +{ + struct padata_queue *queue; + struct parallel_data *pd; + struct padata_instance *pinst; + LIST_HEAD(local_list); + + local_bh_disable(); + queue = container_of(work, struct padata_queue, pwork); + pd = queue->pd; + pinst = pd->pinst; + + spin_lock(&queue->parallel.lock); + list_replace_init(&queue->parallel.list, &local_list); + spin_unlock(&queue->parallel.lock); + + while (!list_empty(&local_list)) { + struct padata_priv *padata; + + padata = list_entry(local_list.next, + struct padata_priv, list); + + list_del_init(&padata->list); + + padata->parallel(padata); + } + + local_bh_enable(); +} + +/* + * padata_do_parallel - padata parallelization function + * + * @pinst: padata instance + * @padata: object to be parallelized + * @cb_cpu: cpu the serialization callback function will run on, + * must be in the cpumask of padata. + * + * The parallelization callback function will run with BHs off. + * Note: Every object which is parallelized by padata_do_parallel + * must be seen by padata_do_serial. + */ +int padata_do_parallel(struct padata_instance *pinst, + struct padata_priv *padata, int cb_cpu) +{ + int target_cpu, err; + struct padata_queue *queue; + struct parallel_data *pd; + + rcu_read_lock_bh(); + + pd = rcu_dereference(pinst->pd); + + err = 0; + if (!(pinst->flags & PADATA_INIT)) + goto out; + + err = -EBUSY; + if ((pinst->flags & PADATA_RESET)) + goto out; + + if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) + goto out; + + err = -EINVAL; + if (!cpumask_test_cpu(cb_cpu, pd->cpumask)) + goto out; + + err = -EINPROGRESS; + atomic_inc(&pd->refcnt); + padata->pd = pd; + padata->cb_cpu = cb_cpu; + + if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr)) + atomic_set(&pd->seq_nr, -1); + + padata->seq_nr = atomic_inc_return(&pd->seq_nr); + + target_cpu = padata_cpu_hash(padata); + queue = per_cpu_ptr(pd->queue, target_cpu); + + spin_lock(&queue->parallel.lock); + list_add_tail(&padata->list, &queue->parallel.list); + spin_unlock(&queue->parallel.lock); + + queue_work_on(target_cpu, pinst->wq, &queue->pwork); + +out: + rcu_read_unlock_bh(); + + return err; +} +EXPORT_SYMBOL(padata_do_parallel); + +static struct padata_priv *padata_get_next(struct parallel_data *pd) +{ + int cpu, num_cpus, empty, calc_seq_nr; + int seq_nr, next_nr, overrun, next_overrun; + struct padata_queue *queue, *next_queue; + struct padata_priv *padata; + struct padata_list *reorder; + + empty = 0; + next_nr = -1; + next_overrun = 0; + next_queue = NULL; + + num_cpus = cpumask_weight(pd->cpumask); + + for_each_cpu(cpu, pd->cpumask) { + queue = per_cpu_ptr(pd->queue, cpu); + reorder = &queue->reorder; + + /* + * Calculate the seq_nr of the object that should be + * next in this queue. + */ + overrun = 0; + calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus) + + queue->cpu_index; + + if (unlikely(calc_seq_nr > pd->max_seq_nr)) { + calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1; + overrun = 1; + } + + if (!list_empty(&reorder->list)) { + padata = list_entry(reorder->list.next, + struct padata_priv, list); + + seq_nr = padata->seq_nr; + BUG_ON(calc_seq_nr != seq_nr); + } else { + seq_nr = calc_seq_nr; + empty++; + } + + if (next_nr < 0 || seq_nr < next_nr + || (next_overrun && !overrun)) { + next_nr = seq_nr; + next_overrun = overrun; + next_queue = queue; + } + } + + padata = NULL; + + if (empty == num_cpus) + goto out; + + reorder = &next_queue->reorder; + + if (!list_empty(&reorder->list)) { + padata = list_entry(reorder->list.next, + struct padata_priv, list); + + if (unlikely(next_overrun)) { + for_each_cpu(cpu, pd->cpumask) { + queue = per_cpu_ptr(pd->queue, cpu); + atomic_set(&queue->num_obj, 0); + } + } + + spin_lock(&reorder->lock); + list_del_init(&padata->list); + atomic_dec(&pd->reorder_objects); + spin_unlock(&reorder->lock); + + atomic_inc(&next_queue->num_obj); + + goto out; + } + + if (next_nr % num_cpus == next_queue->cpu_index) { + padata = ERR_PTR(-ENODATA); + goto out; + } + + padata = ERR_PTR(-EINPROGRESS); +out: + return padata; +} + +static void padata_reorder(struct parallel_data *pd) +{ + struct padata_priv *padata; + struct padata_queue *queue; + struct padata_instance *pinst = pd->pinst; + +try_again: + if (!spin_trylock_bh(&pd->lock)) + goto out; + + while (1) { + padata = padata_get_next(pd); + + if (!padata || PTR_ERR(padata) == -EINPROGRESS) + break; + + if (PTR_ERR(padata) == -ENODATA) { + spin_unlock_bh(&pd->lock); + goto out; + } + + queue = per_cpu_ptr(pd->queue, padata->cb_cpu); + + spin_lock(&queue->serial.lock); + list_add_tail(&padata->list, &queue->serial.list); + spin_unlock(&queue->serial.lock); + + queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork); + } + + spin_unlock_bh(&pd->lock); + + if (atomic_read(&pd->reorder_objects)) + goto try_again; + +out: + return; +} + +static void padata_serial_worker(struct work_struct *work) +{ + struct padata_queue *queue; + struct parallel_data *pd; + LIST_HEAD(local_list); + + local_bh_disable(); + queue = container_of(work, struct padata_queue, swork); + pd = queue->pd; + + spin_lock(&queue->serial.lock); + list_replace_init(&queue->serial.list, &local_list); + spin_unlock(&queue->serial.lock); + + while (!list_empty(&local_list)) { + struct padata_priv *padata; + + padata = list_entry(local_list.next, + struct padata_priv, list); + + list_del_init(&padata->list); + + padata->serial(padata); + atomic_dec(&pd->refcnt); + } + local_bh_enable(); +} + +/* + * padata_do_serial - padata serialization function + * + * @padata: object to be serialized. + * + * padata_do_serial must be called for every parallelized object. + * The serialization callback function will run with BHs off. + */ +void padata_do_serial(struct padata_priv *padata) +{ + int cpu; + struct padata_queue *queue; + struct parallel_data *pd; + + pd = padata->pd; + + cpu = get_cpu(); + queue = per_cpu_ptr(pd->queue, cpu); + + spin_lock(&queue->reorder.lock); + atomic_inc(&pd->reorder_objects); + list_add_tail(&padata->list, &queue->reorder.list); + spin_unlock(&queue->reorder.lock); + + put_cpu(); + + padata_reorder(pd); +} +EXPORT_SYMBOL(padata_do_serial); + +static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, + const struct cpumask *cpumask) +{ + int cpu, cpu_index, num_cpus; + struct padata_queue *queue; + struct parallel_data *pd; + + cpu_index = 0; + + pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); + if (!pd) + goto err; + + pd->queue = alloc_percpu(struct padata_queue); + if (!pd->queue) + goto err_free_pd; + + if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL)) + goto err_free_queue; + + for_each_possible_cpu(cpu) { + queue = per_cpu_ptr(pd->queue, cpu); + + queue->pd = pd; + + if (cpumask_test_cpu(cpu, cpumask) + && cpumask_test_cpu(cpu, cpu_active_mask)) { + queue->cpu_index = cpu_index; + cpu_index++; + } else + queue->cpu_index = -1; + + INIT_LIST_HEAD(&queue->reorder.list); + INIT_LIST_HEAD(&queue->parallel.list); + INIT_LIST_HEAD(&queue->serial.list); + spin_lock_init(&queue->reorder.lock); + spin_lock_init(&queue->parallel.lock); + spin_lock_init(&queue->serial.lock); + + INIT_WORK(&queue->pwork, padata_parallel_worker); + INIT_WORK(&queue->swork, padata_serial_worker); + atomic_set(&queue->num_obj, 0); + } + + cpumask_and(pd->cpumask, cpumask, cpu_active_mask); + + num_cpus = cpumask_weight(pd->cpumask); + pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1; + + atomic_set(&pd->seq_nr, -1); + atomic_set(&pd->reorder_objects, 0); + atomic_set(&pd->refcnt, 0); + pd->pinst = pinst; + spin_lock_init(&pd->lock); + + return pd; + +err_free_queue: + free_percpu(pd->queue); +err_free_pd: + kfree(pd); +err: + return NULL; +} + +static void padata_free_pd(struct parallel_data *pd) +{ + free_cpumask_var(pd->cpumask); + free_percpu(pd->queue); + kfree(pd); +} + +static void padata_replace(struct padata_instance *pinst, + struct parallel_data *pd_new) +{ + struct parallel_data *pd_old = pinst->pd; + + pinst->flags |= PADATA_RESET; + + rcu_assign_pointer(pinst->pd, pd_new); + + synchronize_rcu(); + + while (atomic_read(&pd_old->refcnt) != 0) + yield(); + + flush_workqueue(pinst->wq); + + padata_free_pd(pd_old); + + pinst->flags &= ~PADATA_RESET; +} + +/* + * padata_set_cpumask - set the cpumask that padata should use + * + * @pinst: padata instance + * @cpumask: the cpumask to use + */ +int padata_set_cpumask(struct padata_instance *pinst, + cpumask_var_t cpumask) +{ + struct parallel_data *pd; + int err = 0; + + might_sleep(); + + mutex_lock(&pinst->lock); + + pd = padata_alloc_pd(pinst, cpumask); + if (!pd) { + err = -ENOMEM; + goto out; + } + + cpumask_copy(pinst->cpumask, cpumask); + + padata_replace(pinst, pd); + +out: + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_set_cpumask); + +static int __padata_add_cpu(struct padata_instance *pinst, int cpu) +{ + struct parallel_data *pd; + + if (cpumask_test_cpu(cpu, cpu_active_mask)) { + pd = padata_alloc_pd(pinst, pinst->cpumask); + if (!pd) + return -ENOMEM; + + padata_replace(pinst, pd); + } + + return 0; +} + +/* + * padata_add_cpu - add a cpu to the padata cpumask + * + * @pinst: padata instance + * @cpu: cpu to add + */ +int padata_add_cpu(struct padata_instance *pinst, int cpu) +{ + int err; + + might_sleep(); + + mutex_lock(&pinst->lock); + + cpumask_set_cpu(cpu, pinst->cpumask); + err = __padata_add_cpu(pinst, cpu); + + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_add_cpu); + +static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) +{ + struct parallel_data *pd; + + if (cpumask_test_cpu(cpu, cpu_online_mask)) { + pd = padata_alloc_pd(pinst, pinst->cpumask); + if (!pd) + return -ENOMEM; + + padata_replace(pinst, pd); + } + + return 0; +} + +/* + * padata_remove_cpu - remove a cpu from the padata cpumask + * + * @pinst: padata instance + * @cpu: cpu to remove + */ +int padata_remove_cpu(struct padata_instance *pinst, int cpu) +{ + int err; + + might_sleep(); + + mutex_lock(&pinst->lock); + + cpumask_clear_cpu(cpu, pinst->cpumask); + err = __padata_remove_cpu(pinst, cpu); + + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_remove_cpu); + +/* + * padata_start - start the parallel processing + * + * @pinst: padata instance to start + */ +void padata_start(struct padata_instance *pinst) +{ + might_sleep(); + + mutex_lock(&pinst->lock); + pinst->flags |= PADATA_INIT; + mutex_unlock(&pinst->lock); +} +EXPORT_SYMBOL(padata_start); + +/* + * padata_stop - stop the parallel processing + * + * @pinst: padata instance to stop + */ +void padata_stop(struct padata_instance *pinst) +{ + might_sleep(); + + mutex_lock(&pinst->lock); + pinst->flags &= ~PADATA_INIT; + mutex_unlock(&pinst->lock); +} +EXPORT_SYMBOL(padata_stop); + +static int __cpuinit padata_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int err; + struct padata_instance *pinst; + int cpu = (unsigned long)hcpu; + + pinst = container_of(nfb, struct padata_instance, cpu_notifier); + + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + if (!cpumask_test_cpu(cpu, pinst->cpumask)) + break; + mutex_lock(&pinst->lock); + err = __padata_add_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + if (err) + return NOTIFY_BAD; + break; + + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + if (!cpumask_test_cpu(cpu, pinst->cpumask)) + break; + mutex_lock(&pinst->lock); + err = __padata_remove_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + if (err) + return NOTIFY_BAD; + break; + + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + if (!cpumask_test_cpu(cpu, pinst->cpumask)) + break; + mutex_lock(&pinst->lock); + __padata_remove_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + if (!cpumask_test_cpu(cpu, pinst->cpumask)) + break; + mutex_lock(&pinst->lock); + __padata_add_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + } + + return NOTIFY_OK; +} + +/* + * padata_alloc - allocate and initialize a padata instance + * + * @cpumask: cpumask that padata uses for parallelization + * @wq: workqueue to use for the allocated padata instance + */ +struct padata_instance *padata_alloc(const struct cpumask *cpumask, + struct workqueue_struct *wq) +{ + int err; + struct padata_instance *pinst; + struct parallel_data *pd; + + pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); + if (!pinst) + goto err; + + pd = padata_alloc_pd(pinst, cpumask); + if (!pd) + goto err_free_inst; + + rcu_assign_pointer(pinst->pd, pd); + + pinst->wq = wq; + + cpumask_copy(pinst->cpumask, cpumask); + + pinst->flags = 0; + + pinst->cpu_notifier.notifier_call = padata_cpu_callback; + pinst->cpu_notifier.priority = 0; + err = register_hotcpu_notifier(&pinst->cpu_notifier); + if (err) + goto err_free_pd; + + mutex_init(&pinst->lock); + + return pinst; + +err_free_pd: + padata_free_pd(pd); +err_free_inst: + kfree(pinst); +err: + return NULL; +} +EXPORT_SYMBOL(padata_alloc); + +/* + * padata_free - free a padata instance + * + * @ padata_inst: padata instance to free + */ +void padata_free(struct padata_instance *pinst) +{ + padata_stop(pinst); + + synchronize_rcu(); + + while (atomic_read(&pinst->pd->refcnt) != 0) + yield(); + + unregister_hotcpu_notifier(&pinst->cpu_notifier); + padata_free_pd(pinst->pd); + kfree(pinst); +} +EXPORT_SYMBOL(padata_free); -- cgit v1.2.3 From 809826a389040e0ad9d646b587bccc0e34691afd Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Dec 2009 15:39:34 +0800 Subject: tracing: Have __dynamic_array() define a field This is part of a patch set that removes the show_format method in the ftrace event macros. This patch set requires that all fields are added to the ftrace_event_call->fields. This patch changes __dynamic_array() to call trace_define_field() to include fields that use __dynamic_array(). Signed-off-by: Lai Jiangshan LKML-Reference: <4B273D36.8090100@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_export.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d4fa5dc1ee4e..9978a4f40090 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -175,7 +175,12 @@ ftrace_format_##name(struct ftrace_event_call *unused, \ return ret; #undef __dynamic_array -#define __dynamic_array(type, item) +#define __dynamic_array(type, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + 0, is_signed_type(type), FILTER_OTHER);\ + if (ret) \ + return ret; #undef FTRACE_ENTRY #define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ -- cgit v1.2.3 From 509e760cd91c831983097ae174cb6c0b8c6c8e6b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Dec 2009 15:39:42 +0800 Subject: tracing: Add print_fmt field This is part of a patch set that removes the show_format method in the ftrace event macros. The print_fmt field is added to hold the string that shows the print_fmt in the event format files. This patch only adds the field but it is currently not used. Later patches will use this field to enable us to remove the show_format field and function. Signed-off-by: Lai Jiangshan LKML-Reference: <4B273D3E.2000704@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_export.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 9978a4f40090..95d14b640a66 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -203,6 +203,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) return 0; } +#undef __entry +#define __entry REC + #undef __field #define __field(type, item) @@ -218,6 +221,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) #undef __dynamic_array #define __dynamic_array(type, item) +#undef F_printk +#define F_printk(fmt, args...) #fmt ", " __stringify(args) + #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ \ @@ -228,6 +234,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .id = type, \ .system = __stringify(TRACE_SYSTEM), \ .raw_init = ftrace_raw_init_event, \ + .print_fmt = print, \ .show_format = ftrace_format_##call, \ .define_fields = ftrace_define_fields_##call, \ }; \ -- cgit v1.2.3 From 50307a45f8515f6244e3b08e6b19824b9fbfe293 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Dec 2009 15:39:45 +0800 Subject: tracing/syscalls: Init print_fmt for syscall events This is part of a patch set that removes the show_format method in the ftrace event macros. Add the print_fmt initialization to the syscall events. The print_fmt is still not used, but will be in the follow up patches. Signed-off-by: Lai Jiangshan LKML-Reference: <4B273D41.609@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 68 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 75289f372dd2..1352b0a36fac 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -191,6 +191,67 @@ int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) return trace_seq_putc(s, '\n'); } +static +int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) +{ + int i; + int pos = 0; + + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + for (i = 0; i < entry->nb_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s", + entry->args[i], sizeof(unsigned long), + i == entry->nb_args - 1 ? "" : ", "); + } + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + + for (i = 0; i < entry->nb_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", ((unsigned long)(REC->%s))", entry->args[i]); + } + +#undef LEN_OR_ZERO + + /* return the length of print_fmt */ + return pos; +} + +static int set_syscall_print_fmt(struct ftrace_event_call *call) +{ + char *print_fmt; + int len; + struct syscall_metadata *entry = call->data; + + if (entry->enter_event != call) { + call->print_fmt = "\"0x%lx\", REC->ret"; + return 0; + } + + /* First: called with 0 length to calculate the needed length */ + len = __set_enter_print_fmt(entry, NULL, 0); + + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; + + /* Second: actually write the @print_fmt */ + __set_enter_print_fmt(entry, print_fmt, len + 1); + call->print_fmt = print_fmt; + + return 0; +} + +static void free_syscall_print_fmt(struct ftrace_event_call *call) +{ + struct syscall_metadata *entry = call->data; + + if (entry->enter_event == call) + kfree(call->print_fmt); +} + int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) { int ret; @@ -386,9 +447,14 @@ int init_syscall_trace(struct ftrace_event_call *call) { int id; + if (set_syscall_print_fmt(call) < 0) + return -ENOMEM; + id = register_ftrace_event(call->event); - if (!id) + if (!id) { + free_syscall_print_fmt(call); return -ENODEV; + } call->id = id; INIT_LIST_HEAD(&call->fields); return 0; -- cgit v1.2.3 From a342a0280b981c130e32dbb94dbd3a57959c4d04 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Dec 2009 15:39:49 +0800 Subject: tracing/kprobes: Init print_fmt for kprobe events This is part of a patch set that removes the show_format method in the ftrace event macros. Add the print_fmt initialization to the kprobe events. The print_fmt is still not used, but will be in the follow up patches. Signed-off-by: Lai Jiangshan LKML-Reference: <4B273D45.3080100@cn.fujitsu.com> Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 64 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 6ea90c0e2c96..147491dccead 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1250,6 +1250,62 @@ static int kretprobe_event_show_format(struct ftrace_event_call *call, ", REC->" FIELD_STRING_RETIP); } +static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) +{ + int i; + int pos = 0; + + const char *fmt, *arg; + + if (!probe_is_return(tp)) { + fmt = "(%lx)"; + arg = "REC->" FIELD_STRING_IP; + } else { + fmt = "(%lx <- %lx)"; + arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; + } + + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); + + for (i = 0; i < tp->nr_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx", + tp->args[i].name); + } + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); + + for (i = 0; i < tp->nr_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", + tp->args[i].name); + } + +#undef LEN_OR_ZERO + + /* return the length of print_fmt */ + return pos; +} + +static int set_print_fmt(struct trace_probe *tp) +{ + int len; + char *print_fmt; + + /* First: called with 0 length to calculate the needed length */ + len = __set_print_fmt(tp, NULL, 0); + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; + + /* Second: actually write the @print_fmt */ + __set_print_fmt(tp, print_fmt, len + 1); + tp->call.print_fmt = print_fmt; + + return 0; +} + #ifdef CONFIG_EVENT_PROFILE /* Kprobe profile handler */ @@ -1456,10 +1512,14 @@ static int register_probe_event(struct trace_probe *tp) call->show_format = kprobe_event_show_format; call->define_fields = kprobe_event_define_fields; } + if (set_print_fmt(tp) < 0) + return -ENOMEM; call->event = &tp->event; call->id = register_ftrace_event(&tp->event); - if (!call->id) + if (!call->id) { + kfree(call->print_fmt); return -ENODEV; + } call->enabled = 0; call->regfunc = probe_event_enable; call->unregfunc = probe_event_disable; @@ -1472,6 +1532,7 @@ static int register_probe_event(struct trace_probe *tp) ret = trace_add_event_call(call); if (ret) { pr_info("Failed to register kprobe event: %s\n", call->name); + kfree(call->print_fmt); unregister_ftrace_event(&tp->event); } return ret; @@ -1481,6 +1542,7 @@ static void unregister_probe_event(struct trace_probe *tp) { /* tp->event is unregistered in trace_remove_event_call() */ trace_remove_event_call(&tp->call); + kfree(tp->call.print_fmt); } /* Make a debugfs interface for controling probe points */ -- cgit v1.2.3 From c7ef3a9004201bca90626db246a19dadd2c29c9b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 28 Dec 2009 21:13:59 -0500 Subject: tracing: Have syscall tracing call its own init function In the clean up of having all events call one specific function, the syscall event init was changed to call this helper function. With the new print_fmt updates, the syscalls need to do special initializations. This patch converts the syscall events to call its own init function again. Cc: Lai Jiangshan Cc: Li Zefan Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 1352b0a36fac..a78e86349ecb 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -450,14 +450,14 @@ int init_syscall_trace(struct ftrace_event_call *call) if (set_syscall_print_fmt(call) < 0) return -ENOMEM; - id = register_ftrace_event(call->event); - if (!id) { + id = trace_event_raw_init(call); + + if (id < 0) { free_syscall_print_fmt(call); - return -ENODEV; + return id; } - call->id = id; - INIT_LIST_HEAD(&call->fields); - return 0; + + return id; } int __init init_ftrace_syscalls(void) -- cgit v1.2.3 From 5a65e956220efc2421e21ee56d6153fd5c533a95 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Dec 2009 15:39:53 +0800 Subject: tracing: Use defined fields and print_fmt to print formats The calls ftrace_format_##call() and ftrace_define_fields_##call() are almost duplicate in functionality. With the addition of the print_fmt in previous patches, these two functions can be merged into one. The trace_define_field() defines the fields and links them into the struct ftrace_event_call. The previous patches introduced the print_fmt field and this can now be used with the trace_define_field() to create the event format file fields and print_fmt field. The struct ftrace_event_call->fields are used to print the fields The struct ftrace_event_call->print_fmt is used to print the "print fmt: XXXXXXXXXXX" line. Signed-off-by: Lai Jiangshan LKML-Reference: <4B273D49.5000006@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 65 ++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 189b09baf4fb..250ec865d5f5 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -528,33 +528,16 @@ extern char *__bad_type_size(void); #type, "common_" #name, offsetof(typeof(field), name), \ sizeof(field.name), is_signed_type(type) -static int trace_write_header(struct trace_seq *s) -{ - struct trace_entry field; - - /* struct trace_entry */ - return trace_seq_printf(s, - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" - "\n", - FIELD(unsigned short, type), - FIELD(unsigned char, flags), - FIELD(unsigned char, preempt_count), - FIELD(int, pid), - FIELD(int, lock_depth)); -} - static ssize_t event_format_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct ftrace_event_call *call = filp->private_data; + struct ftrace_event_field *field; struct trace_seq *s; + int common_field_count = 5; char *buf; - int r; + int r = 0; if (*ppos) return 0; @@ -565,14 +548,48 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - /* If any of the first writes fail, so will the show_format. */ - trace_seq_printf(s, "name: %s\n", call->name); trace_seq_printf(s, "ID: %d\n", call->id); trace_seq_printf(s, "format:\n"); - trace_write_header(s); - r = call->show_format(call, s); + list_for_each_entry_reverse(field, &call->fields, link) { + /* + * Smartly shows the array type(except dynamic array). + * Normal: + * field:TYPE VAR + * If TYPE := TYPE[LEN], it is shown: + * field:TYPE VAR[LEN] + */ + const char *array_descriptor = strchr(field->type, '['); + + if (!strncmp(field->type, "__data_loc", 10)) + array_descriptor = NULL; + + if (!array_descriptor) { + r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" + "\tsize:%u;\tsigned:%d;\n", + field->type, field->name, field->offset, + field->size, !!field->is_signed); + } else { + r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;" + "\tsize:%u;\tsigned:%d;\n", + (int)(array_descriptor - field->type), + field->type, field->name, + array_descriptor, field->offset, + field->size, !!field->is_signed); + } + + if (--common_field_count == 0) + r = trace_seq_printf(s, "\n"); + + if (!r) + break; + } + + if (r) + r = trace_seq_printf(s, "\nprint fmt: %s\n", + call->print_fmt); + if (!r) { /* * ug! The format output is bigger than a PAGE!! -- cgit v1.2.3 From 0fa0edaf32b9a78b9854f1da98d4511a501089b0 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Dec 2009 15:39:57 +0800 Subject: tracing: Remove show_format and related macros from TRACE_EVENT The previous patches added the use of print_fmt string and changes the trace_define_field() function to also create the fields and format output for the event format files. text data bss dec hex filename 5857201 1355780 9336808 16549789 fc879d vmlinux 5884589 1351684 9337896 16574169 fce6d9 vmlinux-orig The above shows the size of the vmlinux after this patch set compared to the vmlinux-orig which is before the patch set. This saves us 27k on text, 1k on bss and adds just 4k of data. The total savings of 24k in size. Signed-off-by: Lai Jiangshan LKML-Reference: <4B273D4D.40604@cn.fujitsu.com> Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 12 ------- kernel/trace/trace_export.c | 73 ---------------------------------------- kernel/trace/trace_kprobe.c | 78 ------------------------------------------- kernel/trace/trace_syscalls.c | 66 ------------------------------------ 4 files changed, 229 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 250ec865d5f5..c2a3077b7353 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -520,14 +520,6 @@ out: return ret; } -extern char *__bad_type_size(void); - -#undef FIELD -#define FIELD(type, name) \ - sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ - #type, "common_" #name, offsetof(typeof(field), name), \ - sizeof(field.name), is_signed_type(type) - static ssize_t event_format_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -965,10 +957,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, filter); } - /* A trace may not want to export its format */ - if (!call->show_format) - return 0; - trace_create_file("format", 0444, call->dir, call, format); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 95d14b640a66..e091f64ba6ce 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -62,78 +62,6 @@ static void __always_unused ____ftrace_check_##name(void) \ #include "trace_entries.h" - -#undef __field -#define __field(type, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item), is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __field_desc -#define __field_desc(type, container, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), container.item), \ - sizeof(field.container.item), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __array -#define __array(type, item, len) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item), is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __array_desc -#define __array_desc(type, container, item, len) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), container.item), \ - sizeof(field.container.item), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __dynamic_array -#define __dynamic_array(type, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:0;\tsigned:%u;\n", \ - offsetof(typeof(field), item), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef F_printk -#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) - -#undef __entry -#define __entry REC - -#undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ -static int \ -ftrace_format_##name(struct ftrace_event_call *unused, \ - struct trace_seq *s) \ -{ \ - struct struct_name field __attribute__((unused)); \ - int ret = 0; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: " print); \ - \ - return ret; \ -} - -#include "trace_entries.h" - #undef __field #define __field(type, item) \ ret = trace_define_field(event_call, #type, #item, \ @@ -235,7 +163,6 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .raw_init = ftrace_raw_init_event, \ .print_fmt = print, \ - .show_format = ftrace_format_##call, \ .define_fields = ftrace_define_fields_##call, \ }; \ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 147491dccead..c99029916c76 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1174,82 +1174,6 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) return 0; } -static int __probe_event_show_format(struct trace_seq *s, - struct trace_probe *tp, const char *fmt, - const char *arg) -{ - int i; - - /* Show format */ - if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt)) - return 0; - - for (i = 0; i < tp->nr_args; i++) - if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) - return 0; - - if (!trace_seq_printf(s, "\", %s", arg)) - return 0; - - for (i = 0; i < tp->nr_args; i++) - if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name)) - return 0; - - return trace_seq_puts(s, "\n"); -} - -#undef SHOW_FIELD -#define SHOW_FIELD(type, item, name) \ - do { \ - ret = trace_seq_printf(s, "\tfield:" #type " %s;\t" \ - "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\ - (unsigned int)offsetof(typeof(field), item),\ - (unsigned int)sizeof(type), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; \ - } while (0) - -static int kprobe_event_show_format(struct ftrace_event_call *call, - struct trace_seq *s) -{ - struct kprobe_trace_entry field __attribute__((unused)); - int ret, i; - struct trace_probe *tp = (struct trace_probe *)call->data; - - SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP); - SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); - - /* Show fields */ - for (i = 0; i < tp->nr_args; i++) - SHOW_FIELD(unsigned long, args[i], tp->args[i].name); - trace_seq_puts(s, "\n"); - - return __probe_event_show_format(s, tp, "(%lx)", - "REC->" FIELD_STRING_IP); -} - -static int kretprobe_event_show_format(struct ftrace_event_call *call, - struct trace_seq *s) -{ - struct kretprobe_trace_entry field __attribute__((unused)); - int ret, i; - struct trace_probe *tp = (struct trace_probe *)call->data; - - SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); - SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); - SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); - - /* Show fields */ - for (i = 0; i < tp->nr_args; i++) - SHOW_FIELD(unsigned long, args[i], tp->args[i].name); - trace_seq_puts(s, "\n"); - - return __probe_event_show_format(s, tp, "(%lx <- %lx)", - "REC->" FIELD_STRING_FUNC - ", REC->" FIELD_STRING_RETIP); -} - static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) { int i; @@ -1504,12 +1428,10 @@ static int register_probe_event(struct trace_probe *tp) if (probe_is_return(tp)) { tp->event.trace = print_kretprobe_event; call->raw_init = probe_event_raw_init; - call->show_format = kretprobe_event_show_format; call->define_fields = kretprobe_event_define_fields; } else { tp->event.trace = print_kprobe_event; call->raw_init = probe_event_raw_init; - call->show_format = kprobe_event_show_format; call->define_fields = kprobe_event_define_fields; } if (set_print_fmt(tp) < 0) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index a78e86349ecb..49cea70fbf6d 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -143,54 +143,6 @@ extern char *__bad_type_size(void); #type, #name, offsetof(typeof(trace), name), \ sizeof(trace.name), is_signed_type(type) -int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) -{ - int i; - int ret; - struct syscall_metadata *entry = call->data; - struct syscall_trace_enter trace; - int offset = offsetof(struct syscall_trace_enter, args); - - ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" - "\tsigned:%u;\n", - SYSCALL_FIELD(int, nr)); - if (!ret) - return 0; - - for (i = 0; i < entry->nb_args; i++) { - ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], - entry->args[i]); - if (!ret) - return 0; - ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;" - "\tsigned:%u;\n", offset, - sizeof(unsigned long), - is_signed_type(unsigned long)); - if (!ret) - return 0; - offset += sizeof(unsigned long); - } - - trace_seq_puts(s, "\nprint fmt: \""); - for (i = 0; i < entry->nb_args; i++) { - ret = trace_seq_printf(s, "%s: