From 1f5026a7e21e409c2b9dd54f6dfb9446511fb7c5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Jul 2011 09:58:09 +0200 Subject: memblock: Kill MEMBLOCK_ERROR 25818f0f28 (memblock: Make MEMBLOCK_ERROR be 0) thankfully made MEMBLOCK_ERROR 0 and there already are codes which expect error return to be 0. There's no point in keeping MEMBLOCK_ERROR around. End its misery. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1310457490-3356-6-git-send-email-tj@kernel.org Cc: Yinghai Lu Cc: Benjamin Herrenschmidt Signed-off-by: H. Peter Anvin --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 35185392173f..b1d5a6174d65 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -199,7 +199,7 @@ void __init setup_log_buf(int early) unsigned long mem; mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); - if (mem == MEMBLOCK_ERROR) + if (!mem) return; new_log_buf = __va(mem); } else { -- cgit v1.2.3 From c2bc11113c50449f23c40b724fe410fc2380a8e9 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 27 Oct 2011 18:12:42 -0700 Subject: time: Improve documentation of timekeeeping_adjust() After getting a number of questions in private emails about the math around admittedly very complex timekeeping_adjust() and timekeeping_big_adjust(), I figure the code needs some better comments. Hopefully the explanations are clear enough and don't muddy the water any worse. Still needs documentation for ntp_error, but I couldn't recall exactly the full explanation behind the code that's there (although I do recall once working it out when Roman first proposed it). Given a bit more time I can probably work it out, but I don't want to hold back this documentation until then. Signed-off-by: John Stultz Cc: Chen Jie Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1319764362-32367-1-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2b021b0e8507..025e136f3881 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -802,14 +802,44 @@ static void timekeeping_adjust(s64 offset) s64 error, interval = timekeeper.cycle_interval; int adj; + /* + * The point of this is to check if the error is greater then half + * an interval. + * + * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. + * + * Note we subtract one in the shift, so that error is really error*2. + * This "saves" dividing(shifting) intererval twice, but keeps the + * (error > interval) comparision as still measuring if error is + * larger then half an interval. + * + * Note: It does not "save" on aggrivation when reading the code. + */ error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); if (error > interval) { + /* + * We now divide error by 4(via shift), which checks if + * the error is greater then twice the interval. + * If it is greater, we need a bigadjust, if its smaller, + * we can adjust by 1. + */ error >>= 2; + /* + * XXX - In update_wall_time, we round up to the next + * nanosecond, and store the amount rounded up into + * the error. This causes the likely below to be unlikely. + * + * The properfix is to avoid rounding up by using + * the high precision timekeeper.xtime_nsec instead of + * xtime.tv_nsec everywhere. Fixing this will take some + * time. + */ if (likely(error <= interval)) adj = 1; else adj = timekeeping_bigadjust(error, &interval, &offset); } else if (error < -interval) { + /* See comment above, this is just switched for the negative */ error >>= 2; if (likely(error >= -interval)) { adj = -1; @@ -817,9 +847,58 @@ static void timekeeping_adjust(s64 offset) offset = -offset; } else adj = timekeeping_bigadjust(error, &interval, &offset); - } else + } else /* No adjustment needed */ return; + /* + * So the following can be confusing. + * + * To keep things simple, lets assume adj == 1 for now. + * + * When adj != 1, remember that the interval and offset values + * have been appropriately scaled so the math is the same. + * + * The basic idea here is that we're increasing the multiplier + * by one, this causes the xtime_interval to be incremented by + * one cycle_interval. This is because: + * xtime_interval = cycle_interval * mult + * So if mult is being incremented by one: + * xtime_interval = cycle_interval * (mult + 1) + * Its the same as: + * xtime_interval = (cycle_interval * mult) + cycle_interval + * Which can be shortened to: + * xtime_interval += cycle_interval + * + * So offset stores the non-accumulated cycles. Thus the current + * time (in shifted nanoseconds) is: + * now = (offset * adj) + xtime_nsec + * Now, even though we're adjusting the clock frequency, we have + * to keep time consistent. In other words, we can't jump back + * in time, and we also want to avoid jumping forward in time. + * + * So given the same offset value, we need the time to be the same + * both before and after the freq adjustment. + * now = (offset * adj_1) + xtime_nsec_1 + * now = (offset * adj_2) + xtime_nsec_2 + * So: + * (offset * adj_1) + xtime_nsec_1 = + * (offset * adj_2) + xtime_nsec_2 + * And we know: + * adj_2 = adj_1 + 1 + * So: + * (offset * adj_1) + xtime_nsec_1 = + * (offset * (adj_1+1)) + xtime_nsec_2 + * (offset * adj_1) + xtime_nsec_1 = + * (offset * adj_1) + offset + xtime_nsec_2 + * Canceling the sides: + * xtime_nsec_1 = offset + xtime_nsec_2 + * Which gives us: + * xtime_nsec_2 = xtime_nsec_1 - offset + * Which simplfies to: + * xtime_nsec -= offset + * + * XXX - TODO: Doc ntp_error calculation. + */ timekeeper.mult += adj; timekeeper.xtime_interval += interval; timekeeper.xtime_nsec -= offset; -- cgit v1.2.3 From 6d21af4f7d0ab660b24c8635f4ed577f40cd2978 Mon Sep 17 00:00:00 2001 From: Javi Merino Date: Wed, 26 Oct 2011 10:16:11 +0100 Subject: irq: Fix comment typo ist->is Signed-off-by: Javi Merino Signed-off-by: Jiri Kosina --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9b956fa20308..3261c4d478a2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1281,7 +1281,7 @@ EXPORT_SYMBOL(free_irq); * and to set up the interrupt handler in the right order. * * If you want to set up a threaded irq handler for your device - * then you need to supply @handler and @thread_fn. @handler ist + * then you need to supply @handler and @thread_fn. @handler is * still called in hard interrupt context and has to check * whether the interrupt originates from the device. If yes it * needs to disable the interrupt on the device and return -- cgit v1.2.3 From d631097577f6fe027f4903f62eabced6445d19bf Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 31 Oct 2011 11:07:42 +0200 Subject: tracing: fix event_subsystem ref counting Fix a bug introduced by e9dbfae5, which prevents event_subsystem from ever being released. Ref_count was added to keep track of subsystem users, not for counting events. Subsystem is created with ref_count = 1, so there is no need to increment it for every event, we have nr_events for that. Fix this by touching ref_count only when we actually have a new user - subsystem_open(). Cc: stable@vger.kernel.org Signed-off-by: Ilya Dryomov Link: http://lkml.kernel.org/r/1320052062-7846-1-git-send-email-idryomov@gmail.com Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 581876f9f387..c212a7f934ec 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1078,7 +1078,6 @@ event_subsystem_dir(const char *name, struct dentry *d_events) /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { if (strcmp(system->name, name) == 0) { - __get_system(system); system->nr_events++; return system->entry; } -- cgit v1.2.3 From ed0449af5373abd766c79fbf83254bebc996bd23 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 1 Nov 2011 09:09:35 +0800 Subject: tracing: Restore system filter behavior Though not all events have field 'prev_pid', it was allowed to do this: # echo 'prev_pid == 100' > events/sched/filter but commit 75b8e98263fdb0bfbdeba60d4db463259f1fe8a2 (tracing/filter: Swap entire filter of events) broke it without any reason. Link: http://lkml.kernel.org/r/4EAF46CF.8040408@cn.fujitsu.com Signed-off-by: Li Zefan Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 816d3d074979..86040d9c1acc 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1649,7 +1649,9 @@ static int replace_system_preds(struct event_subsystem *system, */ err = replace_preds(call, NULL, ps, filter_string, true); if (err) - goto fail; + call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; + else + call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; } list_for_each_entry(call, &ftrace_events, list) { @@ -1658,6 +1660,9 @@ static int replace_system_preds(struct event_subsystem *system, if (strcmp(call->class->system, system->name) != 0) continue; + if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER) + continue; + filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); if (!filter_item) goto fail_mem; -- cgit v1.2.3 From dcfce4a095932e6e95d83ad982be3609947963bc Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 11 Oct 2011 17:11:08 +0200 Subject: oprofile, x86: Reimplement nmi timer mode using perf event The legacy x86 nmi watchdog code was removed with the implementation of the perf based nmi watchdog. This broke Oprofile's nmi timer mode. To run nmi timer mode we relied on a continuous ticking nmi source which the nmi watchdog provided. The nmi tick was no longer available and current watchdog can not be used anymore since it runs with very long periods in the range of seconds. This patch reimplements the nmi timer mode using a perf counter nmi source. V2: * removing pr_info() * fix undefined reference to `__udivdi3' for 32 bit build * fix section mismatch of .cpuinit.data:nmi_timer_cpu_nb * removed nmi timer setup in arch/x86 * implemented function stubs for op_nmi_init/exit() * made code more readable in oprofile_init() V3: * fix architectural initialization in oprofile_init() * fix CONFIG_OPROFILE_NMI_TIMER dependencies Acked-by: Peter Zijlstra Signed-off-by: Robert Richter --- kernel/events/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d1a1bee35228..d2e28bdd523a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1322,6 +1322,7 @@ retry: } raw_spin_unlock_irq(&ctx->lock); } +EXPORT_SYMBOL_GPL(perf_event_disable); static void perf_set_shadow_time(struct perf_event *event, struct perf_event_context *ctx, @@ -1806,6 +1807,7 @@ retry: out: raw_spin_unlock_irq(&ctx->lock); } +EXPORT_SYMBOL_GPL(perf_event_enable); int perf_event_refresh(struct perf_event *event, int refresh) { -- cgit v1.2.3 From 49aa29513ec995f201664cf6eee36e5326ed38bf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 2 Nov 2011 16:46:46 -0400 Subject: tracing: Add boiler plate for subsystem filter The system filter can be used to set multiple event filters that exist within the system. But currently it displays the last filter written that does not necessarily correspond to the filters within the system. The system filter itself is not used to filter any events. The system filter is just a means to set filters of the events within it. Because this causes an ambiguous state when the system filter reads a filter string but the events within the system have different strings it is best to just show a boiler plate: ### global filter ### # Use this to set filters for multiple events. # Only events with the given fields will be affected. # If no events are modified, an error message will be displayed here. If an error occurs while writing to the system filter, the system filter will replace the boiler plate with the error message as it currently does. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 86040d9c1acc..fdc6d22d406b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -27,6 +27,12 @@ #include "trace.h" #include "trace_output.h" +#define DEFAULT_SYS_FILTER_MESSAGE \ + "### global filter ###\n" \ + "# Use this to set filters for multiple events.\n" \ + "# Only events with the given fields will be affected.\n" \ + "# If no events are modified, an error message will be displayed here" + enum filter_op_ids { OP_OR, @@ -646,7 +652,7 @@ void print_subsystem_event_filter(struct event_subsystem *system, if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else - trace_seq_printf(s, "none\n"); + trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); mutex_unlock(&event_mutex); } @@ -1838,7 +1844,10 @@ int apply_subsystem_event_filter(struct event_subsystem *system, if (!filter) goto out; - replace_filter_string(filter, filter_string); + /* System filters just show a default message */ + kfree(filter->filter_string); + filter->filter_string = NULL; + /* * No event actually uses the system filter * we can free it without synchronize_sched(). @@ -1848,14 +1857,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system, parse_init(ps, filter_ops, filter_string); err = filter_parse(ps); - if (err) { - append_filter_err(ps, system->filter); - goto out; - } + if (err) + goto err_filter; err = replace_system_preds(system, ps, filter_string); if (err) - append_filter_err(ps, system->filter); + goto err_filter; out: filter_opstack_clear(ps); @@ -1865,6 +1872,11 @@ out_unlock: mutex_unlock(&event_mutex); return err; + +err_filter: + replace_filter_string(filter, filter_string); + append_filter_err(ps, system->filter); + goto out; } #ifdef CONFIG_PERF_EVENTS -- cgit v1.2.3 From e5e78d08f3ab3094783b8df08a5b6d1d1a56a58f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 2 Nov 2011 20:24:16 -0400 Subject: lockdep: Show subclass in pretty print of lockdep output The pretty print of the lockdep debug splat uses just the lock name to show how the locking scenario happens. But when it comes to nesting locks, the output becomes confusing which takes away the point of the pretty printing of the lock scenario. Without displaying the subclass info, we get the following output: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(slock-AF_INET); lock(slock-AF_INET); lock(slock-AF_INET); lock(slock-AF_INET); *** DEADLOCK *** The above looks more of a A->A locking bug than a A->B B->A. By adding the subclass to the output, we can see what really happened: other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(slock-AF_INET); lock(slock-AF_INET/1); lock(slock-AF_INET); lock(slock-AF_INET/1); *** DEADLOCK *** This bug was discovered while tracking down a real bug caught by lockdep. Link: http://lkml.kernel.org/r/20111025202049.GB25043@hostway.ca Cc: Peter Zijlstra Reported-by: Thomas Gleixner Tested-by: Simon Kirby Signed-off-by: Steven Rostedt --- kernel/lockdep.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 91d67ce3a8d5..6bd915df5fd3 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -490,36 +490,32 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) usage[i] = '\0'; } -static int __print_lock_name(struct lock_class *class) +static void __print_lock_name(struct lock_class *class) { char str[KSYM_NAME_LEN]; const char *name; - name = class->name; - if (!name) - name = __get_key_name(class->key, str); - - return printk("%s", name); -} - -static void print_lock_name(struct lock_class *class) -{ - char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; - const char *name; - - get_usage_chars(class, usage); - name = class->name; if (!name) { name = __get_key_name(class->key, str); - printk(" (%s", name); + printk("%s", name); } else { - printk(" (%s", name); + printk("%s", name); if (class->name_version > 1) printk("#%d", class->name_version); if (class->subclass) printk("/%d", class->subclass); } +} + +static void print_lock_name(struct lock_class *class) +{ + char usage[LOCK_USAGE_CHARS]; + + get_usage_chars(class, usage); + + printk(" ("); + __print_lock_name(class); printk("){%s}", usage); } -- cgit v1.2.3 From 8ee3c92b7f2751c392be2d8fc360a410480b8757 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 4 Nov 2011 10:45:23 -0400 Subject: ftrace: Remove force undef config value left for testing A forced undef of a config value was used for testing and was accidently left in during the final commit. This causes x86 to run slower than needed while running function tracing as well as causes the function graph selftest to fail when DYNMAIC_FTRACE is not set. This is because the code in MCOUNT expects the ftrace code to be processed with the config value set that happened to be forced not set. The forced config option was left in by: commit 6331c28c962561aee59e5a493b7556a4bb585957 ftrace: Fix dynamic selftest failure on some archs Link: http://lkml.kernel.org/r/20111102150255.GA6973@debian Cc: stable@vger.kernel.org Reported-by: Rabin Vincent Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 077d85387908..0fcc6caead1c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -151,7 +151,6 @@ void clear_ftrace_function(void) ftrace_pid_function = ftrace_stub; } -#undef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST /* * For those archs that do not test ftrace_trace_stop in their -- cgit v1.2.3 From c8452afb7426f7e21388492f40227582e3e83879 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 18 Oct 2011 19:55:51 +0200 Subject: jump_label: jump_label_inc may return before the code is patched If cpu A calls jump_label_inc() just after atomic_add_return() is called by cpu B, atomic_inc_not_zero() will return value greater then zero and jump_label_inc() will return to a caller before jump_label_update() finishes its job on cpu B. Link: http://lkml.kernel.org/r/20111018175551.GH17571@redhat.com Cc: stable@vger.kernel.org Cc: Peter Zijlstra Acked-by: Jason Baron Signed-off-by: Gleb Natapov Signed-off-by: Steven Rostedt --- kernel/jump_label.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index a8ce45097f3d..e6f1f24ad577 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -66,8 +66,9 @@ void jump_label_inc(struct jump_label_key *key) return; jump_label_lock(); - if (atomic_add_return(1, &key->enabled) == 1) + if (atomic_read(&key->enabled) == 0) jump_label_update(key, JUMP_LABEL_ENABLE); + atomic_inc(&key->enabled); jump_label_unlock(); } -- cgit v1.2.3 From d4d34b981a5327eec956c6cb4cce397ce6f57279 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 4 Nov 2011 20:32:39 -0400 Subject: ftrace: Fix hash record accounting bug If the set_ftrace_filter is cleared by writing just whitespace to it, then the filter hash refcounts will be decremented but not updated. This causes two bugs: 1) No functions will be enabled for tracing when they all should be 2) If the users clears the set_ftrace_filter twice, it will crash ftrace: ------------[ cut here ]------------ WARNING: at /home/rostedt/work/git/linux-trace.git/kernel/trace/ftrace.c:1384 __ftrace_hash_rec_update.part.27+0x157/0x1a7() Modules linked in: Pid: 2330, comm: bash Not tainted 3.1.0-test+ #32 Call Trace: [] warn_slowpath_common+0x83/0x9b [] warn_slowpath_null+0x1a/0x1c [] __ftrace_hash_rec_update.part.27+0x157/0x1a7 [] ? ftrace_regex_release+0xa7/0x10f [] ? kfree+0xe5/0x115 [] ftrace_hash_move+0x2e/0x151 [] ftrace_regex_release+0xba/0x10f [] fput+0xfd/0x1c2 [] filp_close+0x6d/0x78 [] sys_dup3+0x197/0x1c1 [] sys_dup2+0x4f/0x54 [] system_call_fastpath+0x16/0x1b ---[ end trace 77a3a7ee73794a02 ]--- Link: http://lkml.kernel.org/r/20111101141420.GA4918@debian Reported-by: Rabin Vincent Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0fcc6caead1c..7caa4505508d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1210,7 +1210,9 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, if (!src->count) { free_ftrace_hash_rcu(*dst); rcu_assign_pointer(*dst, EMPTY_HASH); - return 0; + /* still need to update the function records */ + ret = 0; + goto out; } /* -- cgit v1.2.3 From 7e9a49ef542610609144d1afcd516dc3fafac4d6 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 7 Nov 2011 16:08:49 +0100 Subject: tracing/latency: Fix header output for latency tracers In case the the graph tracer (CONFIG_FUNCTION_GRAPH_TRACER) or even the function tracer (CONFIG_FUNCTION_TRACER) are not set, the latency tracers do not display proper latency header. The involved/fixed latency tracers are: wakeup_rt wakeup preemptirqsoff preemptoff irqsoff The patch adds proper handling of tracer configuration options for latency tracers, and displaying correct header info accordingly. * The current output (for wakeup tracer) with both graph and function tracers disabled is: # tracer: wakeup # -0 0d.h5 1us+: 0:120:R + [000] 7: 0:R watchdog/0 -0 0d.h5 3us+: ttwu_do_activate.clone.1 <-try_to_wake_up ... * The fixed output is: # tracer: wakeup # # wakeup latency trace v1.1.5 on 3.1.0-tip+ # -------------------------------------------------------------------- # latency: 55 us, #4/4, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2) # ----------------- # | task: migration/0-6 (uid:0 nice:0 policy:1 rt_prio:99) # ----------------- # # _------=> CPU# # / _-----=> irqs-off # | / _----=> need-resched # || / _---=> hardirq/softirq # ||| / _--=> preempt-depth # |||| / delay # cmd pid ||||| time | caller # \ / ||||| \ | / cat-1129 0d..4 1us : 1129:120:R + [000] 6: 0:R migration/0 cat-1129 0d..4 2us+: ttwu_do_activate.clone.1 <-try_to_wake_up * The current output (for wakeup tracer) with only function tracer enabled is: # tracer: wakeup # cat-1140 0d..4 1us+: 1140:120:R + [000] 6: 0:R migration/0 cat-1140 0d..4 2us : ttwu_do_activate.clone.1 <-try_to_wake_up * The fixed output is: # tracer: wakeup # # wakeup latency trace v1.1.5 on 3.1.0-tip+ # -------------------------------------------------------------------- # latency: 207 us, #109/109, CPU#1 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2) # ----------------- # | task: watchdog/1-12 (uid:0 nice:0 policy:1 rt_prio:99) # ----------------- # # _------=> CPU# # / _-----=> irqs-off # | / _----=> need-resched # || / _---=> hardirq/softirq # ||| / _--=> preempt-depth # |||| / delay # cmd pid ||||| time | caller # \ / ||||| \ | / -0 1d.h5 1us+: 0:120:R + [001] 12: 0:R watchdog/1 -0 1d.h5 3us : ttwu_do_activate.clone.1 <-try_to_wake_up Link: http://lkml.kernel.org/r/20111107150849.GE1807@m.brq.redhat.com Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 15 +++++++++++++++ kernel/trace/trace.h | 1 + kernel/trace/trace_irqsoff.c | 13 ++++++++++++- kernel/trace/trace_sched_wakeup.c | 13 ++++++++++++- 4 files changed, 40 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b24a72d35008..b296186eb93a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2140,6 +2140,21 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) return print_trace_fmt(iter); } +void trace_latency_header(struct seq_file *m) +{ + struct trace_iterator *iter = m->private; + + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + + if (iter->iter_flags & TRACE_FILE_LAT_FMT) + print_trace_header(m, iter); + + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_lat_help_header(m); +} + void trace_default_header(struct seq_file *m) { struct trace_iterator *iter = m->private; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 092e1f8d18dc..f8ec2291b522 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -370,6 +370,7 @@ void trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc); +void trace_latency_header(struct seq_file *m); void trace_default_header(struct seq_file *m); void print_trace_header(struct seq_file *m, struct trace_iterator *iter); int trace_empty(struct trace_iterator *iter); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index a1a3359996a7..a248c686b2b8 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -280,9 +280,20 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) } static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } -static void irqsoff_print_header(struct seq_file *s) { } static void irqsoff_trace_open(struct trace_iterator *iter) { } static void irqsoff_trace_close(struct trace_iterator *iter) { } + +#ifdef CONFIG_FUNCTION_TRACER +static void irqsoff_print_header(struct seq_file *s) +{ + trace_default_header(s); +} +#else +static void irqsoff_print_header(struct seq_file *s) +{ + trace_latency_header(s); +} +#endif /* CONFIG_FUNCTION_TRACER */ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ /* diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index e4a70c0c71b6..ff791ea48b57 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -280,9 +280,20 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter) } static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } -static void wakeup_print_header(struct seq_file *s) { } static void wakeup_trace_open(struct trace_iterator *iter) { } static void wakeup_trace_close(struct trace_iterator *iter) { } + +#ifdef CONFIG_FUNCTION_TRACER +static void wakeup_print_header(struct seq_file *s) +{ + trace_default_header(s); +} +#else +static void wakeup_print_header(struct seq_file *s) +{ + trace_latency_header(s); +} +#endif /* CONFIG_FUNCTION_TRACER */ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ /* -- cgit v1.2.3 From d65670a78cdbfae94f20a9e05ec705871d7cdf2b Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 31 Oct 2011 17:06:35 -0400 Subject: clocksource: Avoid selecting mult values that might overflow when adjusted For some frequencies, the clocks_calc_mult_shift() function will unfortunately select mult values very close to 0xffffffff. This has the potential to overflow when NTP adjusts the clock, adding to the mult value. This patch adds a clocksource.maxadj value, which provides an approximation of an 11% adjustment(NTP limits adjustments to 500ppm and the tick adjustment is limited to 10%), which could be made to the clocksource.mult value. This is then used to both check that the current mult value won't overflow/underflow, as well as warning us if the timekeeping_adjust() code pushes over that 11% boundary. v2: Fix max_adjustment calculation, and improve WARN_ONCE messages. v3: Don't warn before maxadj has actually been set CC: Yong Zhang CC: David Daney CC: Thomas Gleixner CC: Chen Jie CC: zhangfx CC: stable@kernel.org Reported-by: Chen Jie Reported-by: zhangfx Tested-by: Yong Zhang Signed-off-by: John Stultz --- kernel/time/clocksource.c | 58 +++++++++++++++++++++++++++++++++++++++-------- kernel/time/timekeeping.c | 7 ++++++ 2 files changed, 55 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index cf52fda2e096..cfc65e1eb9fb 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -491,6 +491,22 @@ void clocksource_touch_watchdog(void) clocksource_resume_watchdog(); } +/** + * clocksource_max_adjustment- Returns max adjustment amount + * @cs: Pointer to clocksource + * + */ +static u32 clocksource_max_adjustment(struct clocksource *cs) +{ + u64 ret; + /* + * We won't try to correct for more then 11% adjustments (110,000 ppm), + */ + ret = (u64)cs->mult * 11; + do_div(ret,100); + return (u32)ret; +} + /** * clocksource_max_deferment - Returns max time the clocksource can be deferred * @cs: Pointer to clocksource @@ -503,25 +519,28 @@ static u64 clocksource_max_deferment(struct clocksource *cs) /* * Calculate the maximum number of cycles that we can pass to the * cyc2ns function without overflowing a 64-bit signed result. The - * maximum number of cycles is equal to ULLONG_MAX/cs->mult which - * is equivalent to the below. - * max_cycles < (2^63)/cs->mult - * max_cycles < 2^(log2((2^63)/cs->mult)) - * max_cycles < 2^(log2(2^63) - log2(cs->mult)) - * max_cycles < 2^(63 - log2(cs->mult)) - * max_cycles < 1 << (63 - log2(cs->mult)) + * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj) + * which is equivalent to the below. + * max_cycles < (2^63)/(cs->mult + cs->maxadj) + * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj))) + * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj)) + * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj)) + * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj)) * Please note that we add 1 to the result of the log2 to account for * any rounding errors, ensure the above inequality is satisfied and * no overflow will occur. */ - max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1)); + max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1)); /* * The actual maximum number of cycles we can defer the clocksource is * determined by the minimum of max_cycles and cs->mask. + * Note: Here we subtract the maxadj to make sure we don't sleep for + * too long if there's a large negative adjustment. */ max_cycles = min_t(u64, max_cycles, (u64) cs->mask); - max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift); + max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj, + cs->shift); /* * To ensure that the clocksource does not wrap whilst we are idle, @@ -640,7 +659,6 @@ static void clocksource_enqueue(struct clocksource *cs) void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) { u64 sec; - /* * Calc the maximum number of seconds which we can run before * wrapping around. For clocksources which have a mask > 32bit @@ -661,6 +679,20 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, NSEC_PER_SEC / scale, sec * scale); + + /* + * for clocksources that have large mults, to avoid overflow. + * Since mult may be adjusted by ntp, add an safety extra margin + * + */ + cs->maxadj = clocksource_max_adjustment(cs); + while ((cs->mult + cs->maxadj < cs->mult) + || (cs->mult - cs->maxadj > cs->mult)) { + cs->mult >>= 1; + cs->shift--; + cs->maxadj = clocksource_max_adjustment(cs); + } + cs->max_idle_ns = clocksource_max_deferment(cs); } EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); @@ -701,6 +733,12 @@ EXPORT_SYMBOL_GPL(__clocksource_register_scale); */ int clocksource_register(struct clocksource *cs) { + /* calculate max adjustment for given mult/shift */ + cs->maxadj = clocksource_max_adjustment(cs); + WARN_ONCE(cs->mult + cs->maxadj < cs->mult, + "Clocksource %s might overflow on 11%% adjustment\n", + cs->name); + /* calculate max idle time permitted for this clocksource */ cs->max_idle_ns = clocksource_max_deferment(cs); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2b021b0e8507..e65ff3171102 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -820,6 +820,13 @@ static void timekeeping_adjust(s64 offset) } else return; + WARN_ONCE(timekeeper.clock->maxadj && + (timekeeper.mult + adj > timekeeper.clock->mult + + timekeeper.clock->maxadj), + "Adjusting %s more then 11%% (%ld vs %ld)\n", + timekeeper.clock->name, (long)timekeeper.mult + adj, + (long)timekeeper.clock->mult + + timekeeper.clock->maxadj); timekeeper.mult += adj; timekeeper.xtime_interval += interval; timekeeper.xtime_nsec -= offset; -- cgit v1.2.3 From cf5f0acf3935c91379e709a71ecf68805d366659 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Oct 2011 16:52:28 +0200 Subject: sched: Add a comment to effective_load() since it's a pain Every time I have to stare at this function I need to completely reverse engineer its workings, about time I write a comment explaining the thing. Collected bits and pieces from previous changelogs, mostly: 4be9daaa1b33701f011f4117f22dc1e45a3e6e34 83378269a5fad98f562ebc0f09c349575e6cbfe1 Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1318518057.27731.2.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 113 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 95 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5c9e67923b7c..aba20f495188 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -772,19 +772,32 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) list_del_leaf_cfs_rq(cfs_rq); } +static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) +{ + long tg_weight; + + /* + * Use this CPU's actual weight instead of the last load_contribution + * to gain a more accurate current total weight. See + * update_cfs_rq_load_contribution(). + */ + tg_weight = atomic_read(&tg->load_weight); + tg_weight -= cfs_rq->load_contribution; + tg_weight += cfs_rq->load.weight; + + return tg_weight; +} + static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { - long load_weight, load, shares; + long tg_weight, load, shares; + tg_weight = calc_tg_weight(tg, cfs_rq); load = cfs_rq->load.weight; - load_weight = atomic_read(&tg->load_weight); - load_weight += load; - load_weight -= cfs_rq->load_contribution; - shares = (tg->shares * load); - if (load_weight) - shares /= load_weight; + if (tg_weight) + shares /= tg_weight; if (shares < MIN_SHARES) shares = MIN_SHARES; @@ -2036,36 +2049,100 @@ static void task_waking_fair(struct task_struct *p) * Adding load to a group doesn't make a group heavier, but can cause movement * of group shares between cpus. Assuming the shares were perfectly aligned one * can calculate the shift in shares. + * + * Calculate the effective load difference if @wl is added (subtracted) to @tg + * on this @cpu and results in a total addition (subtraction) of @wg to the + * total group weight. + * + * Given a runqueue weight distribution (rw_i) we can compute a shares + * distribution (s_i) using: + * + * s_i = rw_i / \Sum rw_j (1) + * + * Suppose we have 4 CPUs and our @tg is a direct child of the root group and + * has 7 equal weight tasks, distributed as below (rw_i), with the resulting + * shares distribution (s_i): + * + * rw_i = { 2, 4, 1, 0 } + * s_i = { 2/7, 4/7, 1/7, 0 } + * + * As per wake_affine() we're interested in the load of two CPUs (the CPU the + * task used to run on and the CPU the waker is running on), we need to + * compute the effect of waking a task on either CPU and, in case of a sync + * wakeup, compute the effect of the current task going to sleep. + * + * So for a change of @wl to the local @cpu with an overall group weight change + * of @wl we can compute the new shares distribution (s'_i) using: + * + * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2) + * + * Suppose we're interested in CPUs 0 and 1, and want to compute the load + * differences in waking a task to CPU 0. The additional task changes the + * weight and shares distributions like: + * + * rw'_i = { 3, 4, 1, 0 } + * s'_i = { 3/8, 4/8, 1/8, 0 } + * + * We can then compute the difference in effective weight by using: + * + * dw_i = S * (s'_i - s_i) (3) + * + * Where 'S' is the group weight as seen by its parent. + * + * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7) + * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 - + * 4/7) times the weight of the group. */ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { struct sched_entity *se = tg->se[cpu]; - if (!tg->parent) + if (!tg->parent) /* the trivial, non-cgroup case */ return wl; for_each_sched_entity(se) { - long lw, w; + long w, W; tg = se->my_q->tg; - w = se->my_q->load.weight; - /* use this cpu's instantaneous contribution */ - lw = atomic_read(&tg->load_weight); - lw -= se->my_q->load_contribution; - lw += w + wg; + /* + * W = @wg + \Sum rw_j + */ + W = wg + calc_tg_weight(tg, se->my_q); - wl += w; + /* + * w = rw_i + @wl + */ + w = se->my_q->load.weight + wl; - if (lw > 0 && wl < lw) - wl = (wl * tg->shares) / lw; + /* + * wl = S * s'_i; see (2) + */ + if (W > 0 && w < W) + wl = (w * tg->shares) / W; else wl = tg->shares; - /* zero point is MIN_SHARES */ + /* + * Per the above, wl is the new se->load.weight value; since + * those are clipped to [MIN_SHARES, ...) do so now. See + * calc_cfs_shares(). + */ if (wl < MIN_SHARES) wl = MIN_SHARES; + + /* + * wl = dw_i = S * (s'_i - s_i); see (3) + */ wl -= se->load.weight; + + /* + * Recursively apply this logic to all parent groups to compute + * the final effective load change on the root group. Since + * only the @tg group gets extra weight, all parent groups can + * only redistribute existing shares. @wl is the shift in shares + * resulting from this level per the above. + */ wg = 0; } -- cgit v1.2.3 From 461819ac8ee950ce027c72a066156a3df9e60c7e Mon Sep 17 00:00:00 2001 From: Hui Kang Date: Tue, 11 Oct 2011 23:00:59 -0400 Subject: sched_fair: Fix a typo in the comment describing update_sd_lb_stats Signed-off-by: Hui Kang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1318388459-4427-1-git-send-email-hkang.sunysb@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index aba20f495188..7e51b5bb27cc 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3588,7 +3588,7 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, } /** - * update_sd_lb_stats - Update sched_group's statistics for load balancing. + * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @sd: sched_domain whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu -- cgit v1.2.3 From c6dc7f055d333ef35f397b8d7c3abcd1918bf8cb Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 6 Oct 2011 15:22:46 -0400 Subject: sched: Document wait_for_completion_*() return values The return-value convention for these functions varies depending on whether they're interruptible or can timeout. It can be a little confusing--document it. Signed-off-by: J. Bruce Fields Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20111006192246.GB28026@fieldses.org Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0e9344a71be3..3d2c436959a1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4810,6 +4810,9 @@ EXPORT_SYMBOL(wait_for_completion); * This waits for either a completion of a specific task to be signaled or for a * specified timeout to expire. The timeout is in jiffies. It is not * interruptible. + * + * The return value is 0 if timed out, and positive (at least 1, or number of + * jiffies left till timeout) if completed. */ unsigned long __sched wait_for_completion_timeout(struct completion *x, unsigned long timeout) @@ -4824,6 +4827,8 @@ EXPORT_SYMBOL(wait_for_completion_timeout); * * This waits for completion of a specific task to be signaled. It is * interruptible. + * + * The return value is -ERESTARTSYS if interrupted, 0 if completed. */ int __sched wait_for_completion_interruptible(struct completion *x) { @@ -4841,6 +4846,9 @@ EXPORT_SYMBOL(wait_for_completion_interruptible); * * This waits for either a completion of a specific task to be signaled or for a * specified timeout to expire. It is interruptible. The timeout is in jiffies. + * + * The return value is -ERESTARTSYS if interrupted, 0 if timed out, + * positive (at least 1, or number of jiffies left till timeout) if completed. */ long __sched wait_for_completion_interruptible_timeout(struct completion *x, @@ -4856,6 +4864,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); * * This waits to be signaled for completion of a specific task. It can be * interrupted by a kill signal. + * + * The return value is -ERESTARTSYS if interrupted, 0 if completed. */ int __sched wait_for_completion_killable(struct completion *x) { @@ -4874,6 +4884,9 @@ EXPORT_SYMBOL(wait_for_completion_killable); * This waits for either a completion of a specific task to be * signaled or for a specified timeout to expire. It can be * interrupted by a kill signal. The timeout is in jiffies. + * + * The return value is -ERESTARTSYS if interrupted, 0 if timed out, + * positive (at least 1, or number of jiffies left till timeout) if completed. */ long __sched wait_for_completion_killable_timeout(struct completion *x, -- cgit v1.2.3 From 4a6184ce7a48c478dee0d8a9ed74c1fa35161858 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Oct 2011 22:39:14 +0200 Subject: sched, rt: Provide means of disabling cross-cpu bandwidth sharing Normally the RT bandwidth scheme will share bandwidth across the entire root_domain. However sometimes its convenient to disable this sharing for debug purposes. Provide a simple feature switch to this end. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_features.h | 1 + kernel/sched_rt.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_features.h b/kernel/sched_features.h index efa0a7b75dde..84802245abd2 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -67,3 +67,4 @@ SCHED_FEAT(NONTASK_POWER, 1) SCHED_FEAT(TTWU_QUEUE, 1) SCHED_FEAT(FORCE_SD_OVERLAP, 0) +SCHED_FEAT(RT_RUNTIME_SHARE, 1) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 056cbd2e2a27..583a1368afe6 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -560,6 +560,9 @@ static int balance_runtime(struct rt_rq *rt_rq) { int more = 0; + if (!sched_feat(RT_RUNTIME_SHARE)) + return more; + if (rt_rq->rt_time > rt_rq->rt_runtime) { raw_spin_unlock(&rt_rq->rt_runtime_lock); more = do_balance_runtime(rt_rq); -- cgit v1.2.3 From f1c6f1a7eed963ed233ba4c8b6fa8addb86c6ddc Mon Sep 17 00:00:00 2001 From: Carsten Emde Date: Wed, 26 Oct 2011 23:14:16 +0200 Subject: sched: Set the command name of the idle tasks in SMP kernels In UP systems, the idle task is initialized using the init_task structure from which the command name is taken (currently "swapper"). In SMP systems, one idle task per CPU is forked by the worker thread from which the task structure is copied. The command name is, therefore, "kworker/0:0" or "kworker/0:1", if not updated. Since such update was lacking, all idle tasks in SMP systems were incorrectly named. This longtime bug was not discovered immediately, because there is no /proc/0 entry - the bug only becomes apparent when tracing is enabled. This patch sets the command name of the idle tasks in SMP systems to the name that is used in the INIT_TASK structure suffixed by a slash and the number of the CPU. Signed-off-by: Carsten Emde Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20111026211708.768925506@osadl.org Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3d2c436959a1..d6b149ccf925 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #include @@ -6112,6 +6113,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) */ idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); +#if defined(CONFIG_SMP) + sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); +#endif } /* -- cgit v1.2.3 From 1d5f003f5a964711853514b04ddc872eec0fdc7b Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 23 Oct 2011 19:10:33 +0200 Subject: perf: Do not set task_ctx pointer in cpuctx if there are no events in the context Do not set task_ctx pointer during sched_in if there are no events associated with the context. Otherwise if during task execution total number of events in the system will become zero perf_event_context_sched_out() will not be called and cpuctx->task_ctx will be left with a stale value. Signed-off-by: Gleb Natapov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20111023171033.GI17571@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 0e8457da6f95..b0c1186fd97b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2173,7 +2173,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, perf_event_sched_in(cpuctx, ctx, task); - cpuctx->task_ctx = ctx; + if (ctx->nr_events) + cpuctx->task_ctx = ctx; perf_pmu_enable(ctx->pmu); perf_ctx_unlock(cpuctx, ctx); -- cgit v1.2.3 From 9251f904f95175b4a1d8cbc0449e748f9edd7629 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 16 Oct 2011 17:15:04 +0200 Subject: perf: Carve out callchain functionality Split the callchain code from the perf events core into a new kernel/events/callchain.c file. This simplifies a bit the big core.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian [keep ctx recursion handling inline and use internal headers] Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1318778104-17152-1-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/events/Makefile | 2 +- kernel/events/callchain.c | 191 ++++++++++++++++++++++++++++++++++++++++++ kernel/events/core.c | 209 ---------------------------------------------- kernel/events/internal.h | 39 ++++++++- 4 files changed, 230 insertions(+), 211 deletions(-) create mode 100644 kernel/events/callchain.c (limited to 'kernel') diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 89e5e8aa4c36..22d901f9caf4 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_core.o = -pg endif -obj-y := core.o ring_buffer.o +obj-y := core.o ring_buffer.o callchain.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c new file mode 100644 index 000000000000..057e24b665cf --- /dev/null +++ b/kernel/events/callchain.c @@ -0,0 +1,191 @@ +/* + * Performance events callchain code, extracted from core.c: + * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra + * Copyright © 2009 Paul Mackerras, IBM Corp. + * + * For licensing details see kernel-base/COPYING + */ + +#include +#include +#include "internal.h" + +struct callchain_cpus_entries { + struct rcu_head rcu_head; + struct perf_callchain_entry *cpu_entries[0]; +}; + +static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); +static atomic_t nr_callchain_events; +static DEFINE_MUTEX(callchain_mutex); +static struct callchain_cpus_entries *callchain_cpus_entries; + + +__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +__weak void perf_callchain_user(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +static void release_callchain_buffers_rcu(struct rcu_head *head) +{ + struct callchain_cpus_entries *entries; + int cpu; + + entries = container_of(head, struct callchain_cpus_entries, rcu_head); + + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + + kfree(entries); +} + +static void release_callchain_buffers(void) +{ + struct callchain_cpus_entries *entries; + + entries = callchain_cpus_entries; + rcu_assign_pointer(callchain_cpus_entries, NULL); + call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); +} + +static int alloc_callchain_buffers(void) +{ + int cpu; + int size; + struct callchain_cpus_entries *entries; + + /* + * We can't use the percpu allocation API for data that can be + * accessed from NMI. Use a temporary manual per cpu allocation + * until that gets sorted out. + */ + size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); + + entries = kzalloc(size, GFP_KERNEL); + if (!entries) + return -ENOMEM; + + size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; + + for_each_possible_cpu(cpu) { + entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, + cpu_to_node(cpu)); + if (!entries->cpu_entries[cpu]) + goto fail; + } + + rcu_assign_pointer(callchain_cpus_entries, entries); + + return 0; + +fail: + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + kfree(entries); + + return -ENOMEM; +} + +int get_callchain_buffers(void) +{ + int err = 0; + int count; + + mutex_lock(&callchain_mutex); + + count = atomic_inc_return(&nr_callchain_events); + if (WARN_ON_ONCE(count < 1)) { + err = -EINVAL; + goto exit; + } + + if (count > 1) { + /* If the allocation failed, give up */ + if (!callchain_cpus_entries) + err = -ENOMEM; + goto exit; + } + + err = alloc_callchain_buffers(); + if (err) + release_callchain_buffers(); +exit: + mutex_unlock(&callchain_mutex); + + return err; +} + +void put_callchain_buffers(void) +{ + if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { + release_callchain_buffers(); + mutex_unlock(&callchain_mutex); + } +} + +static struct perf_callchain_entry *get_callchain_entry(int *rctx) +{ + int cpu; + struct callchain_cpus_entries *entries; + + *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); + if (*rctx == -1) + return NULL; + + entries = rcu_dereference(callchain_cpus_entries); + if (!entries) + return NULL; + + cpu = smp_processor_id(); + + return &entries->cpu_entries[cpu][*rctx]; +} + +static void +put_callchain_entry(int rctx) +{ + put_recursion_context(__get_cpu_var(callchain_recursion), rctx); +} + +struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + int rctx; + struct perf_callchain_entry *entry; + + + entry = get_callchain_entry(&rctx); + if (rctx == -1) + return NULL; + + if (!entry) + goto exit_put; + + entry->nr = 0; + + if (!user_mode(regs)) { + perf_callchain_store(entry, PERF_CONTEXT_KERNEL); + perf_callchain_kernel(entry, regs); + if (current->mm) + regs = task_pt_regs(current); + else + regs = NULL; + } + + if (regs) { + perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_user(entry, regs); + } + +exit_put: + put_callchain_entry(rctx); + + return entry; +} diff --git a/kernel/events/core.c b/kernel/events/core.c index 0e8457da6f95..eadac69265fc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2569,215 +2569,6 @@ static u64 perf_event_read(struct perf_event *event) return perf_event_count(event); } -/* - * Callchain support - */ - -struct callchain_cpus_entries { - struct rcu_head rcu_head; - struct perf_callchain_entry *cpu_entries[0]; -}; - -static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); -static atomic_t nr_callchain_events; -static DEFINE_MUTEX(callchain_mutex); -struct callchain_cpus_entries *callchain_cpus_entries; - - -__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, - struct pt_regs *regs) -{ -} - -__weak void perf_callchain_user(struct perf_callchain_entry *entry, - struct pt_regs *regs) -{ -} - -static void release_callchain_buffers_rcu(struct rcu_head *head) -{ - struct callchain_cpus_entries *entries; - int cpu; - - entries = container_of(head, struct callchain_cpus_entries, rcu_head); - - for_each_possible_cpu(cpu) - kfree(entries->cpu_entries[cpu]); - - kfree(entries); -} - -static void release_callchain_buffers(void) -{ - struct callchain_cpus_entries *entries; - - entries = callchain_cpus_entries; - rcu_assign_pointer(callchain_cpus_entries, NULL); - call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); -} - -static int alloc_callchain_buffers(void) -{ - int cpu; - int size; - struct callchain_cpus_entries *entries; - - /* - * We can't use the percpu allocation API for data that can be - * accessed from NMI. Use a temporary manual per cpu allocation - * until that gets sorted out. - */ - size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); - - entries = kzalloc(size, GFP_KERNEL); - if (!entries) - return -ENOMEM; - - size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; - - for_each_possible_cpu(cpu) { - entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, - cpu_to_node(cpu)); - if (!entries->cpu_entries[cpu]) - goto fail; - } - - rcu_assign_pointer(callchain_cpus_entries, entries); - - return 0; - -fail: - for_each_possible_cpu(cpu) - kfree(entries->cpu_entries[cpu]); - kfree(entries); - - return -ENOMEM; -} - -static int get_callchain_buffers(void) -{ - int err = 0; - int count; - - mutex_lock(&callchain_mutex); - - count = atomic_inc_return(&nr_callchain_events); - if (WARN_ON_ONCE(count < 1)) { - err = -EINVAL; - goto exit; - } - - if (count > 1) { - /* If the allocation failed, give up */ - if (!callchain_cpus_entries) - err = -ENOMEM; - goto exit; - } - - err = alloc_callchain_buffers(); - if (err) - release_callchain_buffers(); -exit: - mutex_unlock(&callchain_mutex); - - return err; -} - -static void put_callchain_buffers(void) -{ - if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { - release_callchain_buffers(); - mutex_unlock(&callchain_mutex); - } -} - -static int get_recursion_context(int *recursion) -{ - int rctx; - - if (in_nmi()) - rctx = 3; - else if (in_irq()) - rctx = 2; - else if (in_softirq()) - rctx = 1; - else - rctx = 0; - - if (recursion[rctx]) - return -1; - - recursion[rctx]++; - barrier(); - - return rctx; -} - -static inline void put_recursion_context(int *recursion, int rctx) -{ - barrier(); - recursion[rctx]--; -} - -static struct perf_callchain_entry *get_callchain_entry(int *rctx) -{ - int cpu; - struct callchain_cpus_entries *entries; - - *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); - if (*rctx == -1) - return NULL; - - entries = rcu_dereference(callchain_cpus_entries); - if (!entries) - return NULL; - - cpu = smp_processor_id(); - - return &entries->cpu_entries[cpu][*rctx]; -} - -static void -put_callchain_entry(int rctx) -{ - put_recursion_context(__get_cpu_var(callchain_recursion), rctx); -} - -static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - int rctx; - struct perf_callchain_entry *entry; - - - entry = get_callchain_entry(&rctx); - if (rctx == -1) - return NULL; - - if (!entry) - goto exit_put; - - entry->nr = 0; - - if (!user_mode(regs)) { - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); - perf_callchain_kernel(entry, regs); - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; - } - - if (regs) { - perf_callchain_store(entry, PERF_CONTEXT_USER); - perf_callchain_user(entry, regs); - } - -exit_put: - put_callchain_entry(rctx); - - return entry; -} - /* * Initialize the perf_event context in a task_struct: */ diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 09097dd8116c..be4a43f6de4f 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -1,6 +1,10 @@ #ifndef _KERNEL_EVENTS_INTERNAL_H #define _KERNEL_EVENTS_INTERNAL_H +#include + +/* Buffer handling */ + #define RING_BUFFER_WRITABLE 0x01 struct ring_buffer { @@ -64,7 +68,7 @@ static inline int page_order(struct ring_buffer *rb) } #endif -static unsigned long perf_data_size(struct ring_buffer *rb) +static inline unsigned long perf_data_size(struct ring_buffer *rb) { return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); } @@ -93,4 +97,37 @@ __output_copy(struct perf_output_handle *handle, } while (len); } +/* Callchain handling */ +extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); +extern int get_callchain_buffers(void); +extern void put_callchain_buffers(void); + +static inline int get_recursion_context(int *recursion) +{ + int rctx; + + if (in_nmi()) + rctx = 3; + else if (in_irq()) + rctx = 2; + else if (in_softirq()) + rctx = 1; + else + rctx = 0; + + if (recursion[rctx]) + return -1; + + recursion[rctx]++; + barrier(); + + return rctx; +} + +static inline void put_recursion_context(int *recursion, int rctx) +{ + barrier(); + recursion[rctx]--; +} + #endif /* _KERNEL_EVENTS_INTERNAL_H */ -- cgit v1.2.3 From 5d81e5cfb37a174e8ddc0413e2e70cdf05807ace Mon Sep 17 00:00:00 2001 From: Andrew Vagin Date: Mon, 7 Nov 2011 15:54:12 +0300 Subject: events: Don't divide events if it has field period This patch solves the following problem: Now some samples may be lost due to throttling. The number of samples is restricted by sysctl_perf_event_sample_rate/HZ. A trace event is divided on some samples according to event's period. I don't sure, that we should generate more than one sample on each trace event. I think the better way to use SAMPLE_PERIOD. E.g.: I want to trace when a process sleeps. I created a process, which sleeps for 1ms and for 4ms. perf got 100 events in both cases. swapper 0 [000] 1141.371830: sched_stat_sleep: comm=foo pid=1801 delay=1386750 [ns] swapper 0 [000] 1141.369444: sched_stat_sleep: comm=foo pid=1801 delay=4499585 [ns] In the first case a kernel want to send 4499585 events and in the second case it wants to send 1386750 events. perf-reports shows that process sleeps in both places equal time. It's bug. With this patch kernel generates one event on each "sleep" and the time slice is saved in the field "period". Perf knows how handle it. Signed-off-by: Andrew Vagin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1320670457-2633428-3-git-send-email-avagin@openvz.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index eadac69265fc..8d9dea56c262 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4528,7 +4528,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, struct hw_perf_event *hwc = &event->hw; int throttle = 0; - data->period = event->hw.last_period; if (!overflow) overflow = perf_swevent_set_period(event); @@ -4562,6 +4561,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, if (!is_sampling_event(event)) return; + if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) { + data->period = nr; + return perf_swevent_overflow(event, 1, data, regs); + } else + data->period = event->hw.last_period; + if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) return perf_swevent_overflow(event, 1, data, regs); -- cgit v1.2.3 From 94d24fc47219219b5aa23b45956cc37ee5aa5b01 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 7 Jun 2011 11:17:30 +0200 Subject: printk, lockdep: Disable lock debugging on zap_locks() zap_locks() is used by printk() in a last ditch effort to get data out, clearly we cannot trust lock state after this so make it disable lock debugging. Also don't treat printk recursion through lockdep as a normal recursion bug but try hard to get the lockdep splat out. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-kqxwmo4xz37e1s8w0xopvr0q@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/printk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 1455a0d4eedd..6d087944e72a 100644 --- a/kernel/printk.c +++ b/ker