From 3f6da3905398826d85731247e7fbcf53400c18bd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 5 Mar 2010 13:01:18 +0100
Subject: perf: Rework and fix the arch CPU-hotplug hooks

Remove the hw_perf_event_*() hotplug hooks in favour of per PMU hotplug
notifiers. This has the advantage of reducing the static weak interface
as well as exposing all hotplug actions to the PMU.

Use this to fix x86 hotplug usage where we did things in ONLINE which
should have been done in UP_PREPARE or STARTING.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
LKML-Reference: <20100305154128.736225361@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 15 ---------------
 1 file changed, 15 deletions(-)

(limited to 'kernel')
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 4393b9e73740..73329dedb5ad 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -81,10 +81,6 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
 void __weak hw_perf_disable(void)		{ barrier(); }
 void __weak hw_perf_enable(void)		{ barrier(); }
 
-void __weak hw_perf_event_setup(int cpu)	{ barrier(); }
-void __weak hw_perf_event_setup_online(int cpu)	{ barrier(); }
-void __weak hw_perf_event_setup_offline(int cpu)	{ barrier(); }
-
 int __weak
 hw_perf_group_sched_in(struct perf_event *group_leader,
 	       struct perf_cpu_context *cpuctx,
@@ -5382,8 +5378,6 @@ static void __cpuinit perf_event_init_cpu(int cpu)
 	spin_lock(&perf_resource_lock);
 	cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
 	spin_unlock(&perf_resource_lock);
-
-	hw_perf_event_setup(cpu);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -5423,20 +5417,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 		perf_event_init_cpu(cpu);
 		break;
 
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-		hw_perf_event_setup_online(cpu);
-		break;
-
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		perf_event_exit_cpu(cpu);
 		break;
 
-	case CPU_DEAD:
-		hw_perf_event_setup_offline(cpu);
-		break;
-
 	default:
 		break;
 	}
-- 
cgit v1.2.3


From 32975a4f114be52286f9a5bf6c230dbb8c0e1903 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 6 Mar 2010 19:49:19 +0100
Subject: perf: Optimize perf_disable

Currently we always call hw_perf_disable(), even if its already disabled,
this seems superflous, esp. since it cannot be made NMI safe (see further
patches).

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 73329dedb5ad..d8108465397d 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -93,25 +93,15 @@ void __weak perf_event_print_debug(void)	{ }
 
 static DEFINE_PER_CPU(int, perf_disable_count);
 
-void __perf_disable(void)
-{
-	__get_cpu_var(perf_disable_count)++;
-}
-
-bool __perf_enable(void)
-{
-	return !--__get_cpu_var(perf_disable_count);
-}
-
 void perf_disable(void)
 {
-	__perf_disable();
-	hw_perf_disable();
+	if (!__get_cpu_var(perf_disable_count)++)
+		hw_perf_disable();
 }
 
 void perf_enable(void)
 {
-	if (__perf_enable())
+	if (!--__get_cpu_var(perf_disable_count))
 		hw_perf_enable();
 }
 
-- 
cgit v1.2.3


From d4944a06666054707d23e11888e480af239e5abf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 8 Mar 2010 13:51:20 +0100
Subject: perf: Provide better condition for event rotation

Try to avoid useless rotation and PMU disables.

[ Could be improved by keeping a nr_runnable count to better account
  for the < PERF_STAT_INACTIVE counters ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index d8108465397d..52c69a34d697 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1524,12 +1524,15 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 		 */
 		if (interrupts == MAX_INTERRUPTS) {
 			perf_log_throttle(event, 1);
+			perf_disable();
 			event->pmu->unthrottle(event);
+			perf_enable();
 		}
 
 		if (!event->attr.freq || !event->attr.sample_freq)
 			continue;
 
+		perf_disable();
 		event->pmu->read(event);
 		now = atomic64_read(&event->count);
 		delta = now - hwc->freq_count_stamp;
@@ -1537,6 +1540,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 
 		if (delta > 0)
 			perf_adjust_period(event, TICK_NSEC, delta);
+		perf_enable();
 	}
 	raw_spin_unlock(&ctx->lock);
 }
@@ -1546,9 +1550,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
  */
 static void rotate_ctx(struct perf_event_context *ctx)
 {
-	if (!ctx->nr_events)
-		return;
-
 	raw_spin_lock(&ctx->lock);
 
 	/* Rotate the first entry last of non-pinned groups */
@@ -1561,19 +1562,28 @@ void perf_event_task_tick(struct task_struct *curr)
 {
 	struct perf_cpu_context *cpuctx;
 	struct perf_event_context *ctx;
+	int rotate = 0;
 
 	if (!atomic_read(&nr_events))
 		return;
 
 	cpuctx = &__get_cpu_var(perf_cpu_context);
-	ctx = curr->perf_event_ctxp;
+	if (cpuctx->ctx.nr_events &&
+	    cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
+		rotate = 1;
 
-	perf_disable();
+	ctx = curr->perf_event_ctxp;
+	if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active)
+		rotate = 1;
 
 	perf_ctx_adjust_freq(&cpuctx->ctx);
 	if (ctx)
 		perf_ctx_adjust_freq(ctx);
 
+	if (!rotate)
+		return;
+
+	perf_disable();
 	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 	if (ctx)
 		task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
@@ -1585,7 +1595,6 @@ void perf_event_task_tick(struct task_struct *curr)
 	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
 	if (ctx)
 		task_ctx_sched_in(curr, EVENT_FLEXIBLE);
-
 	perf_enable();
 }
 
-- 
cgit v1.2.3


From db2c4c7791cd04512093d05afc693c3511a65fd7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 2 Feb 2010 23:34:40 +0100
Subject: lockdep: Move lock events under lockdep recursion protection

There are rcu locked read side areas in the path where we submit
a trace event. And these rcu_read_(un)lock() trigger lock events,
which create recursive events.

One pair in do_perf_sw_event:

__lock_acquire
      |
      |--96.11%-- lock_acquire
      |          |
      |          |--27.21%-- do_perf_sw_event
      |          |          perf_tp_event
      |          |          |
      |          |          |--49.62%-- ftrace_profile_lock_release
      |          |          |          lock_release
      |          |          |          |
      |          |          |          |--33.85%-- _raw_spin_unlock

Another pair in perf_output_begin/end:

__lock_acquire
      |--23.40%-- perf_output_begin
      |          |          __perf_event_overflow
      |          |          perf_swevent_overflow
      |          |          perf_swevent_add
      |          |          perf_swevent_ctx_event
      |          |          do_perf_sw_event
      |          |          perf_tp_event
      |          |          |
      |          |          |--55.37%-- ftrace_profile_lock_acquire
      |          |          |          lock_acquire
      |          |          |          |
      |          |          |          |--37.31%-- _raw_spin_lock

The problem is not that much the trace recursion itself, as we have a
recursion protection already (though it's always wasteful to recurse).
But the trace events are outside the lockdep recursion protection, then
each lockdep event triggers a lock trace, which will trigger two
other lockdep events. Here the recursive lock trace event won't
be taken because of the trace recursion, so the recursion stops there
but lockdep will still analyse these new events:

To sum up, for each lockdep events we have:

	lock_*()
	     |
             trace lock_acquire
                  |
                  ----- rcu_read_lock()
                  |          |
                  |          lock_acquire()
                  |          |
                  |          trace_lock_acquire() (stopped)
                  |          |
		  |          lockdep analyze
                  |
                  ----- rcu_read_unlock()
                             |
                             lock_release
                             |
                             trace_lock_release() (stopped)
                             |
                             lockdep analyze

And you can repeat the above two times as we have two rcu read side
sections when we submit an event.

This is fixed in this patch by moving the lock trace event under
the lockdep recursion protection.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
---
 kernel/lockdep.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 0c30d0455de1..65b5f5b7c298 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3211,8 +3211,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 {
 	unsigned long flags;
 
-	trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
-
 	if (unlikely(current->lockdep_recursion))
 		return;
 
@@ -3220,6 +3218,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	check_flags(flags);
 
 	current->lockdep_recursion = 1;
+	trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
 	__lock_acquire(lock, subclass, trylock, read, check,
 		       irqs_disabled_flags(flags), nest_lock, ip, 0);
 	current->lockdep_recursion = 0;
@@ -3232,14 +3231,13 @@ void lock_release(struct lockdep_map *lock, int nested,
 {
 	unsigned long flags;
 
-	trace_lock_release(lock, nested, ip);
-
 	if (unlikely(current->lockdep_recursion))
 		return;
 
 	raw_local_irq_save(flags);
 	check_flags(flags);
 	current->lockdep_recursion = 1;
+	trace_lock_release(lock, nested, ip);
 	__lock_release(lock, nested, ip);
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
@@ -3413,8 +3411,6 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
 {
 	unsigned long flags;
 
-	trace_lock_contended(lock, ip);
-
 	if (unlikely(!lock_stat))
 		return;
 
@@ -3424,6 +3420,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
 	raw_local_irq_save(flags);
 	check_flags(flags);
 	current->lockdep_recursion = 1;
+	trace_lock_contended(lock, ip);
 	__lock_contended(lock, ip);
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
-- 
cgit v1.2.3


From 5331d7b84613b8325362dde53dc2bff2fb87d351 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 4 Mar 2010 21:15:56 +0100
Subject: perf: Introduce new perf_fetch_caller_regs() for hot regs snapshot

Events that trigger overflows by interrupting a context can
use get_irq_regs() or task_pt_regs() to retrieve the state
when the event triggered. But this is not the case for some
other class of events like trace events as tracepoints are
executed in the same context than the code that triggered
the event.

It means we need a different api to capture the regs there,
namely we need a hot snapshot to get the most important
informations for perf: the instruction pointer to get the
event origin, the frame pointer for the callchain, the code
segment for user_mode() tests (we always use __KERNEL_CS as
trace events always occur from the kernel) and the eflags
for further purposes.

v2: rename perf_save_regs to perf_fetch_caller_regs as per
Masami's suggestion.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Archs <linux-arch@vger.kernel.org>
---
 kernel/perf_event.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 52c69a34d697..359d7f690c2b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2786,6 +2786,11 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 	return NULL;
 }
 
+__weak
+void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
+{
+}
+
 /*
  * Output
  */
-- 
cgit v1.2.3


From c530665c31c0140b74ca7689e7f836177796e5bd Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 3 Mar 2010 07:16:16 +0100
Subject: perf: Take a hot regs snapshot for trace events

We are taking a wrong regs snapshot when a trace event triggers.
Either we use get_irq_regs(), which gives us the interrupted
registers if we are in an interrupt, or we use task_pt_regs()
which gives us the state before we entered the kernel, assuming
we are lucky enough to be no kernel thread, in which case
task_pt_regs() returns the initial set of regs when the kernel
thread was started.

What we want is different. We need a hot snapshot of the regs,
so that we can get the instruction pointer to record in the
sample, the frame pointer for the callchain, and some other
things.

Let's use the new perf_fetch_caller_regs() for that.

Comparison with perf record -e lock: -R -a -f -g
Before:

        perf  [kernel]                   [k] __do_softirq
               |
               --- __do_softirq
                  |
                  |--55.16%-- __open
                  |
                   --44.84%-- __write_nocancel

After:

            perf  [kernel]           [k] perf_tp_event
               |
               --- perf_tp_event
                  |
                  |--41.07%-- lock_acquire
                  |          |
                  |          |--39.36%-- _raw_spin_lock
                  |          |          |
                  |          |          |--7.81%-- hrtimer_interrupt
                  |          |          |          smp_apic_timer_interrupt
                  |          |          |          apic_timer_interrupt

The old case was producing unreliable callchains. Now having
right frame and instruction pointers, we have the trace we
want.

Also syscalls and kprobe events already have the right regs,
let's use them instead of wasting a retrieval.

v2: Follow the rename perf_save_regs() -> perf_fetch_caller_regs()

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Archs <linux-arch@vger.kernel.org>
---
 kernel/perf_event.c                | 8 ++------
 kernel/trace/trace_event_profile.c | 3 ++-
 kernel/trace/trace_kprobe.c        | 5 +++--
 kernel/trace/trace_syscalls.c      | 4 ++--
 4 files changed, 9 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 359d7f690c2b..45b4b6e55891 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4318,9 +4318,8 @@ static const struct pmu perf_ops_task_clock = {
 #ifdef CONFIG_EVENT_TRACING
 
 void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
-			  int entry_size)
+		   int entry_size, struct pt_regs *regs)
 {
-	struct pt_regs *regs = get_irq_regs();
 	struct perf_sample_data data;
 	struct perf_raw_record raw = {
 		.size = entry_size,
@@ -4330,12 +4329,9 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
 	perf_sample_data_init(&data, addr);
 	data.raw = &raw;
 
-	if (!regs)
-		regs = task_pt_regs(current);
-
 	/* Trace events already protected against recursion */
 	do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
-				&data, regs);
+			 &data, regs);
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
 
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index f0d693005075..e66d21e15a0f 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -2,13 +2,14 @@
  * trace event based perf counter profiling
  *
  * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
- *
+ * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
  */
 
 #include <linux/module.h>
 #include <linux/kprobes.h>
 #include "trace.h"
 
+DEFINE_PER_CPU(struct pt_regs, perf_trace_regs);
 
 static char *perf_trace_buf;
 static char *perf_trace_buf_nmi;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 505c92273b1a..f7a20a8bfb31 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1240,7 +1240,7 @@ static __kprobes void kprobe_profile_func(struct kprobe *kp,
 	for (i = 0; i < tp->nr_args; i++)
 		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
 
-	ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags);
+	ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs);
 }
 
 /* Kretprobe profile handler */
@@ -1271,7 +1271,8 @@ static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
 	for (i = 0; i < tp->nr_args; i++)
 		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
 
-	ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags);
+	ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1,
+			       irq_flags, regs);
 }
 
 static int probe_profile_enable(struct ftrace_event_call *call)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index cba47d7935cc..7e6e84fb7b6c 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -467,7 +467,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 	rec->nr = syscall_nr;
 	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
 			       (unsigned long *)&rec->args);
-	ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
+	ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags, regs);
 }
 
 int prof_sysenter_enable(struct ftrace_event_call *call)
@@ -542,7 +542,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 	rec->nr = syscall_nr;
 	rec->ret = syscall_get_return_value(current, regs);
 
-	ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
+	ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags, regs);
 }
 
 int prof_sysexit_enable(struct ftrace_event_call *call)
-- 
cgit v1.2.3


From 97d5a22005f38057b4bc0d95f81cd26510268794 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 5 Mar 2010 05:35:37 +0100
Subject: perf: Drop the obsolete profile naming for trace events

Drop the obsolete "profile" naming used by perf for trace events.
Perf can now do more than simple events counting, so generalize
the API naming.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Jason Baron <jbaron@redhat.com>
---
 kernel/perf_event.c                |   4 +-
 kernel/trace/Makefile              |   2 +-
 kernel/trace/trace_event_perf.c    | 165 +++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_event_profile.c | 165 -------------------------------------
 kernel/trace/trace_events.c        |   2 +-
 kernel/trace/trace_kprobe.c        |  28 +++----
 kernel/trace/trace_syscalls.c      |  72 ++++++++--------
 7 files changed, 219 insertions(+), 219 deletions(-)
 create mode 100644 kernel/trace/trace_event_perf.c
 delete mode 100644 kernel/trace/trace_event_profile.c

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 45b4b6e55891..c502b18594cc 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4347,7 +4347,7 @@ static int perf_tp_event_match(struct perf_event *event,
 
 static void tp_perf_event_destroy(struct perf_event *event)
 {
-	ftrace_profile_disable(event->attr.config);
+	perf_trace_disable(event->attr.config);
 }
 
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
@@ -4361,7 +4361,7 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
 			!capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
-	if (ftrace_profile_enable(event->attr.config))
+	if (perf_trace_enable(event->attr.config))
 		return NULL;
 
 	event->destroy = tp_perf_event_destroy;
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index d00c6fe23f54..78edc6490038 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events.o
 obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
 ifeq ($(CONFIG_PERF_EVENTS),y)
-obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o
+obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
 endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
new file mode 100644
index 000000000000..f315b12a41d8
--- /dev/null
+++ b/kernel/trace/trace_event_perf.c
@@ -0,0 +1,165 @@
+/*
+ * trace event based perf event profiling/tracing
+ *
+ * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kprobes.h>
+#include "trace.h"
+
+DEFINE_PER_CPU(struct pt_regs, perf_trace_regs);
+
+static char *perf_trace_buf;
+static char *perf_trace_buf_nmi;
+
+typedef typeof(char [PERF_MAX_TRACE_SIZE]) perf_trace_t ;
+
+/* Count the events in use (per event id, not per instance) */
+static int	total_ref_count;
+
+static int perf_trace_event_enable(struct ftrace_event_call *event)
+{
+	char *buf;
+	int ret = -ENOMEM;
+
+	if (event->perf_refcount++ > 0)
+		return 0;
+
+	if (!total_ref_count) {
+		buf = (char *)alloc_percpu(perf_trace_t);
+		if (!buf)
+			goto fail_buf;
+
+		rcu_assign_pointer(perf_trace_buf, buf);
+
+		buf = (char *)alloc_percpu(perf_trace_t);
+		if (!buf)
+			goto fail_buf_nmi;
+
+		rcu_assign_pointer(perf_trace_buf_nmi, buf);
+	}
+
+	ret = event->perf_event_enable(event);
+	if (!ret) {
+		total_ref_count++;
+		return 0;
+	}
+
+fail_buf_nmi:
+	if (!total_ref_count) {
+		free_percpu(perf_trace_buf_nmi);
+		free_percpu(perf_trace_buf);
+		perf_trace_buf_nmi = NULL;
+		perf_trace_buf = NULL;
+	}
+fail_buf:
+	event->perf_refcount--;
+
+	return ret;
+}
+
+int perf_trace_enable(int event_id)
+{
+	struct ftrace_event_call *event;
+	int ret = -EINVAL;
+
+	mutex_lock(&event_mutex);
+	list_for_each_entry(event, &ftrace_events, list) {
+		if (event->id == event_id && event->perf_event_enable &&
+		    try_module_get(event->mod)) {
+			ret = perf_trace_event_enable(event);
+			break;
+		}
+	}
+	mutex_unlock(&event_mutex);
+
+	return ret;
+}
+
+static void perf_trace_event_disable(struct ftrace_event_call *event)
+{
+	char *buf, *nmi_buf;
+
+	if (--event->perf_refcount > 0)
+		return;
+
+	event->perf_event_disable(event);
+
+	if (!--total_ref_count) {
+		buf = perf_trace_buf;
+		rcu_assign_pointer(perf_trace_buf, NULL);
+
+		nmi_buf = perf_trace_buf_nmi;
+		rcu_assign_pointer(perf_trace_buf_nmi, NULL);
+
+		/*
+		 * Ensure every events in profiling have finished before
+		 * releasing the buffers
+		 */
+		synchronize_sched();
+
+		free_percpu(buf);
+		free_percpu(nmi_buf);
+	}
+}
+
+void perf_trace_disable(int event_id)
+{
+	struct ftrace_event_call *event;
+
+	mutex_lock(&event_mutex);
+	list_for_each_entry(event, &ftrace_events, list) {
+		if (event->id == event_id) {
+			perf_trace_event_disable(event);
+			module_put(event->mod);
+			break;
+		}
+	}
+	mutex_unlock(&event_mutex);
+}
+
+__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
+				       int *rctxp, unsigned long *irq_flags)
+{
+	struct trace_entry *entry;
+	char *trace_buf, *raw_data;
+	int pc, cpu;
+
+	pc = preempt_count();
+
+	/* Protect the per cpu buffer, begin the rcu read side */
+	local_irq_save(*irq_flags);
+
+	*rctxp = perf_swevent_get_recursion_context();
+	if (*rctxp < 0)
+		goto err_recursion;
+
+	cpu = smp_processor_id();
+
+	if (in_nmi())
+		trace_buf = rcu_dereference(perf_trace_buf_nmi);
+	else
+		trace_buf = rcu_dereference(perf_trace_buf);
+
+	if (!trace_buf)
+		goto err;
+
+	raw_data = per_cpu_ptr(trace_buf, cpu);
+
+	/* zero the dead bytes from align to not leak stack to user */
+	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+
+	entry = (struct trace_entry *)raw_data;
+	tracing_generic_entry_update(entry, *irq_flags, pc);
+	entry->type = type;
+
+	return raw_data;
+err:
+	perf_swevent_put_recursion_context(*rctxp);
+err_recursion:
+	local_irq_restore(*irq_flags);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
deleted file mode 100644
index e66d21e15a0f..000000000000
--- a/kernel/trace/trace_event_profile.c
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * trace event based perf counter profiling
- *
- * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
- * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
- */
-
-#include <linux/module.h>
-#include <linux/kprobes.h>
-#include "trace.h"
-
-DEFINE_PER_CPU(struct pt_regs, perf_trace_regs);
-
-static char *perf_trace_buf;
-static char *perf_trace_buf_nmi;
-
-typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
-
-/* Count the events in use (per event id, not per instance) */
-static int	total_profile_count;
-
-static int ftrace_profile_enable_event(struct ftrace_event_call *event)
-{
-	char *buf;
-	int ret = -ENOMEM;
-
-	if (event->profile_count++ > 0)
-		return 0;
-
-	if (!total_profile_count) {
-		buf = (char *)alloc_percpu(perf_trace_t);
-		if (!buf)
-			goto fail_buf;
-
-		rcu_assign_pointer(perf_trace_buf, buf);
-
-		buf = (char *)alloc_percpu(perf_trace_t);
-		if (!buf)
-			goto fail_buf_nmi;
-
-		rcu_assign_pointer(perf_trace_buf_nmi, buf);
-	}
-
-	ret = event->profile_enable(event);
-	if (!ret) {
-		total_profile_count++;
-		return 0;
-	}
-
-fail_buf_nmi:
-	if (!total_profile_count) {
-		free_percpu(perf_trace_buf_nmi);
-		free_percpu(perf_trace_buf);
-		perf_trace_buf_nmi = NULL;
-		perf_trace_buf = NULL;
-	}
-fail_buf:
-	event->profile_count--;
-
-	return ret;
-}
-
-int ftrace_profile_enable(int event_id)
-{
-	struct ftrace_event_call *event;
-	int ret = -EINVAL;
-
-	mutex_lock(&event_mutex);
-	list_for_each_entry(event, &ftrace_events, list) {
-		if (event->id == event_id && event->profile_enable &&
-		    try_module_get(event->mod)) {
-			ret = ftrace_profile_enable_event(event);
-			break;
-		}
-	}
-	mutex_unlock(&event_mutex);
-
-	return ret;
-}
-
-static void ftrace_profile_disable_event(struct ftrace_event_call *event)
-{
-	char *buf, *nmi_buf;
-
-	if (--event->profile_count > 0)
-		return;
-
-	event->profile_disable(event);
-
-	if (!--total_profile_count) {
-		buf = perf_trace_buf;
-		rcu_assign_pointer(perf_trace_buf, NULL);
-
-		nmi_buf = perf_trace_buf_nmi;
-		rcu_assign_pointer(perf_trace_buf_nmi, NULL);
-
-		/*
-		 * Ensure every events in profiling have finished before
-		 * releasing the buffers
-		 */
-		synchronize_sched();
-
-		free_percpu(buf);
-		free_percpu(nmi_buf);
-	}
-}
-
-void ftrace_profile_disable(int event_id)
-{
-	struct ftrace_event_call *event;
-
-	mutex_lock(&event_mutex);
-	list_for_each_entry(event, &ftrace_events, list) {
-		if (event->id == event_id) {
-			ftrace_profile_disable_event(event);
-			module_put(event->mod);
-			break;
-		}
-	}
-	mutex_unlock(&event_mutex);
-}
-
-__kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type,
-					int *rctxp, unsigned long *irq_flags)
-{
-	struct trace_entry *entry;
-	char *trace_buf, *raw_data;
-	int pc, cpu;
-
-	pc = preempt_count();
-
-	/* Protect the per cpu buffer, begin the rcu read side */
-	local_irq_save(*irq_flags);
-
-	*rctxp = perf_swevent_get_recursion_context();
-	if (*rctxp < 0)
-		goto err_recursion;
-
-	cpu = smp_processor_id();
-
-	if (in_nmi())
-		trace_buf = rcu_dereference(perf_trace_buf_nmi);
-	else
-		trace_buf = rcu_dereference(perf_trace_buf);
-
-	if (!trace_buf)
-		goto err;
-
-	raw_data = per_cpu_ptr(trace_buf, cpu);
-
-	/* zero the dead bytes from align to not leak stack to user */
-	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
-
-	entry = (struct trace_entry *)raw_data;
-	tracing_generic_entry_update(entry, *irq_flags, pc);
-	entry->type = type;
-
-	return raw_data;
-err:
-	perf_swevent_put_recursion_context(*rctxp);
-err_recursion:
-	local_irq_restore(*irq_flags);
-	return NULL;
-}
-EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3f972ad98d04..beab8bf2f310 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -938,7 +938,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
 		trace_create_file("enable", 0644, call->dir, call,
 				  enable);
 
-	if (call->id && call->profile_enable)
+	if (call->id && call->perf_event_enable)
 		trace_create_file("id", 0444, call->dir, call,
 		 		  id);
 
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index f7a20a8bfb31..1251e367bae9 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1214,7 +1214,7 @@ static int set_print_fmt(struct trace_probe *tp)
 #ifdef CONFIG_PERF_EVENTS
 
 /* Kprobe profile handler */
-static __kprobes void kprobe_profile_func(struct kprobe *kp,
+static __kprobes void kprobe_perf_func(struct kprobe *kp,
 					 struct pt_regs *regs)
 {
 	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
@@ -1227,11 +1227,11 @@ static __kprobes void kprobe_profile_func(struct kprobe *kp,
 	__size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
 	size = ALIGN(__size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
-	if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
 		     "profile buffer not large enough"))
 		return;
 
-	entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
+	entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
 	if (!entry)
 		return;
 
@@ -1240,11 +1240,11 @@ static __kprobes void kprobe_profile_func(struct kprobe *kp,
 	for (i = 0; i < tp->nr_args; i++)
 		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
 
-	ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs);
+	perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs);
 }
 
 /* Kretprobe profile handler */
-static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
+static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
 					    struct pt_regs *regs)
 {
 	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
@@ -1257,11 +1257,11 @@ static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
 	__size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
 	size = ALIGN(__size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
-	if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
 		     "profile buffer not large enough"))
 		return;
 
-	entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
+	entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
 	if (!entry)
 		return;
 
@@ -1271,11 +1271,11 @@ static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
 	for (i = 0; i < tp->nr_args; i++)
 		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
 
-	ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1,
+	perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1,
 			       irq_flags, regs);
 }
 
-static int probe_profile_enable(struct ftrace_event_call *call)
+static int probe_perf_enable(struct ftrace_event_call *call)
 {
 	struct trace_probe *tp = (struct trace_probe *)call->data;
 
@@ -1287,7 +1287,7 @@ static int probe_profile_enable(struct ftrace_event_call *call)
 		return enable_kprobe(&tp->rp.kp);
 }
 
-static void probe_profile_disable(struct ftrace_event_call *call)
+static void probe_perf_disable(struct ftrace_event_call *call)
 {
 	struct trace_probe *tp = (struct trace_probe *)call->data;
 
@@ -1312,7 +1312,7 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
 		kprobe_trace_func(kp, regs);
 #ifdef CONFIG_PERF_EVENTS
 	if (tp->flags & TP_FLAG_PROFILE)
-		kprobe_profile_func(kp, regs);
+		kprobe_perf_func(kp, regs);
 #endif
 	return 0;	/* We don't tweek kernel, so just return 0 */
 }
@@ -1326,7 +1326,7 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
 		kretprobe_trace_func(ri, regs);
 #ifdef CONFIG_PERF_EVENTS
 	if (tp->flags & TP_FLAG_PROFILE)
-		kretprobe_profile_func(ri, regs);
+		kretprobe_perf_func(ri, regs);
 #endif
 	return 0;	/* We don't tweek kernel, so just return 0 */
 }
@@ -1359,8 +1359,8 @@ static int register_probe_event(struct trace_probe *tp)
 	call->unregfunc = probe_event_disable;
 
 #ifdef CONFIG_PERF_EVENTS
-	call->profile_enable = probe_profile_enable;
-	call->profile_disable = probe_profile_disable;
+	call->perf_event_enable = probe_perf_enable;
+	call->perf_event_disable = probe_perf_disable;
 #endif
 	call->data = tp;
 	ret = trace_add_event_call(call);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 7e6e84fb7b6c..33c2a5b769dc 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -428,12 +428,12 @@ core_initcall(init_ftrace_syscalls);
 
 #ifdef CONFIG_PERF_EVENTS
 
-static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
-static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls);
-static int sys_prof_refcount_enter;
-static int sys_prof_refcount_exit;
+static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
+static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
+static int sys_perf_refcount_enter;
+static int sys_perf_refcount_exit;
 
-static void prof_syscall_enter(struct pt_regs *regs, long id)
+static void perf_syscall_enter(struct pt_regs *regs, long id)
 {
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_enter *rec;
@@ -443,7 +443,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 	int size;
 
 	syscall_nr = syscall_get_nr(current, regs);
-	if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
+	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
 		return;
 
 	sys_data = syscall_nr_to_meta(syscall_nr);
@@ -455,11 +455,11 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 	size = ALIGN(size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
-		      "profile buffer not large enough"))
+	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
+		      "perf buffer not large enough"))
 		return;
 
-	rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size,
+	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
 				sys_data->enter_event->id, &rctx, &flags);
 	if (!rec)
 		return;
@@ -467,10 +467,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 	rec->nr = syscall_nr;
 	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
 			       (unsigned long *)&rec->args);
-	ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags, regs);
+	perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
 }
 
-int prof_sysenter_enable(struct ftrace_event_call *call)
+int perf_sysenter_enable(struct ftrace_event_call *call)
 {
 	int ret = 0;
 	int num;
@@ -478,34 +478,34 @@ int prof_sysenter_enable(struct ftrace_event_call *call)
 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 
 	mutex_lock(&syscall_trace_lock);
-	if (!sys_prof_refcount_enter)
-		ret = register_trace_sys_enter(prof_syscall_enter);
+	if (!sys_perf_refcount_enter)
+		ret = register_trace_sys_enter(perf_syscall_enter);
 	if (ret) {
 		pr_info("event trace: Could not activate"
 				"syscall entry trace point");
 	} else {
-		set_bit(num, enabled_prof_enter_syscalls);
-		sys_prof_refcount_enter++;
+		set_bit(num, enabled_perf_enter_syscalls);
+		sys_perf_refcount_enter++;
 	}
 	mutex_unlock(&syscall_trace_lock);
 	return ret;
 }
 
-void prof_sysenter_disable(struct ftrace_event_call *call)
+void perf_sysenter_disable(struct ftrace_event_call *call)
 {
 	int num;
 
 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 
 	mutex_lock(&syscall_trace_lock);
-	sys_prof_refcount_enter--;
-	clear_bit(num, enabled_prof_enter_syscalls);
-	if (!sys_prof_refcount_enter)
-		unregister_trace_sys_enter(prof_syscall_enter);
+	sys_perf_refcount_enter--;
+	clear_bit(num, enabled_perf_enter_syscalls);
+	if (!sys_perf_refcount_enter)
+		unregister_trace_sys_enter(perf_syscall_enter);
 	mutex_unlock(&syscall_trace_lock);
 }
 
-static void prof_syscall_exit(struct pt_regs *regs, long ret)
+static void perf_syscall_exit(struct pt_regs *regs, long ret)
 {
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_exit *rec;
@@ -515,7 +515,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 	int size;
 
 	syscall_nr = syscall_get_nr(current, regs);
-	if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
+	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
 		return;
 
 	sys_data = syscall_nr_to_meta(syscall_nr);
@@ -530,11 +530,11 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 	 * Impossible, but be paranoid with the future
 	 * How to put this check outside runtime?
 	 */
-	if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
-		"exit event has grown above profile buffer size"))
+	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
+		"exit event has grown above perf buffer size"))
 		return;
 
-	rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size,
+	rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
 				sys_data->exit_event->id, &rctx, &flags);
 	if (!rec)
 		return;
@@ -542,10 +542,10 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 	rec->nr = syscall_nr;
 	rec->ret = syscall_get_return_value(current, regs);
 
-	ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags, regs);
+	perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
 }
 
-int prof_sysexit_enable(struct ftrace_event_call *call)
+int perf_sysexit_enable(struct ftrace_event_call *call)
 {
 	int ret = 0;
 	int num;
@@ -553,30 +553,30 @@ int prof_sysexit_enable(struct ftrace_event_call *call)
 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 
 	mutex_lock(&syscall_trace_lock);
-	if (!sys_prof_refcount_exit)
-		ret = register_trace_sys_exit(prof_syscall_exit);
+	if (!sys_perf_refcount_exit)
+		ret = register_trace_sys_exit(perf_syscall_exit);
 	if (ret) {
 		pr_info("event trace: Could not activate"
 				"syscall exit trace point");
 	} else {
-		set_bit(num, enabled_prof_exit_syscalls);
-		sys_prof_refcount_exit++;
+		set_bit(num, enabled_perf_exit_syscalls);
+		sys_perf_refcount_exit++;
 	}
 	mutex_unlock(&syscall_trace_lock);
 	return ret;
 }
 
-void prof_sysexit_disable(struct ftrace_event_call *call)
+void perf_sysexit_disable(struct ftrace_event_call *call)
 {
 	int num;
 
 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 
 	mutex_lock(&syscall_trace_lock);
-	sys_prof_refcount_exit--;
-	clear_bit(num, enabled_prof_exit_syscalls);
-	if (!sys_prof_refcount_exit)
-		unregister_trace_sys_exit(prof_syscall_exit);
+	sys_perf_refcount_exit--;
+	clear_bit(num, enabled_perf_exit_syscalls);
+	if (!sys_perf_refcount_exit)
+		unregister_trace_sys_exit(perf_syscall_exit);
 	mutex_unlock(&syscall_trace_lock);
 }
 
-- 
cgit v1.2.3


From 0b1adaa031a55e44f5dd942f234bf09d28e8a0d6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Mar 2010 19:45:54 +0100
Subject: genirq: Prevent oneshot irq thread race

Lars-Peter pointed out that the oneshot threaded interrupt handler
code has the following race:

 CPU0                            CPU1
 hande_level_irq(irq X)
   mask_ack_irq(irq X)
   handle_IRQ_event(irq X)
     wake_up(thread_handler)
                                 thread handler(irq X) runs
                                 finalize_oneshot(irq X)
				  does not unmask due to
				  !(desc->status & IRQ_MASKED)

 return from irq
 does not unmask due to
 (desc->status & IRQ_ONESHOT)

This leaves the interrupt line masked forever.

The reason for this is the inconsistent handling of the IRQ_MASKED
flag. Instead of setting it in the mask function the oneshot support
sets the flag after waking up the irq thread.

The solution for this is to set/clear the IRQ_MASKED status whenever
we mask/unmask an interrupt line. That's the easy part, but that
cleanup opens another race:

 CPU0                            CPU1
 hande_level_irq(irq)
   mask_ack_irq(irq)
   handle_IRQ_event(irq)
     wake_up(thread_handler)
                                 thread handler(irq) runs
                                 finalize_oneshot_irq(irq)
				  unmask(irq)
     irq triggers again
     handle_level_irq(irq)
       mask_ack_irq(irq)
     return from irq due to IRQ_INPROGRESS

 return from irq
 does not unmask due to
 (desc->status & IRQ_ONESHOT)

This requires that we synchronize finalize_oneshot_irq() with the
primary handler. If IRQ_INPROGESS is set we wait until the primary
handler on the other CPU has returned before unmasking the interrupt
line again.

We probably have never seen that problem because it does not happen on
UP and on SMP the irqbalancer protects us by pinning the primary
handler and the thread to the same CPU.

Reported-by: Lars-Peter Clausen <lars@metafoo.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@kernel.org
---
 kernel/irq/chip.c   | 31 ++++++++++++++++++++++---------
 kernel/irq/manage.c | 18 ++++++++++++++++++
 2 files changed, 40 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index d70394f12ee9..71eba24a39a2 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -359,6 +359,23 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
 		if (desc->chip->ack)
 			desc->chip->ack(irq);
 	}
+	desc->status |= IRQ_MASKED;
+}
+
+static inline void mask_irq(struct irq_desc *desc, int irq)
+{
+	if (desc->chip->mask) {
+		desc->chip->mask(irq);
+		desc->status |= IRQ_MASKED;
+	}
+}
+
+static inline void unmask_irq(struct irq_desc *desc, int irq)
+{
+	if (desc->chip->unmask) {
+		desc->chip->unmask(irq);
+		desc->status &= ~IRQ_MASKED;
+	}
 }
 
 /*
@@ -484,10 +501,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
 	raw_spin_lock(&desc->lock);
 	desc->status &= ~IRQ_INPROGRESS;
 
-	if (unlikely(desc->status & IRQ_ONESHOT))
-		desc->status |= IRQ_MASKED;
-	else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
-		desc->chip->unmask(irq);
+	if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
+		unmask_irq(desc, irq);
 out_unlock:
 	raw_spin_unlock(&desc->lock);
 }
@@ -524,8 +539,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
 	action = desc->action;
 	if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
 		desc->status |= IRQ_PENDING;
-		if (desc->chip->mask)
-			desc->chip->mask(irq);
+		mask_irq(desc, irq);
 		goto out;
 	}
 
@@ -593,7 +607,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 		irqreturn_t action_ret;
 
 		if (unlikely(!action)) {
-			desc->chip->mask(irq);
+			mask_irq(desc, irq);
 			goto out_unlock;
 		}
 
@@ -605,8 +619,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 		if (unlikely((desc->status &
 			       (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
 			      (IRQ_PENDING | IRQ_MASKED))) {
-			desc->chip->unmask(irq);
-			desc->status &= ~IRQ_MASKED;
+			unmask_irq(desc, irq);
 		}
 
 		desc->status &= ~IRQ_PENDING;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index eb6078ca60c7..69a3d7b9414c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -483,8 +483,26 @@ static int irq_wait_for_interrupt(struct irqaction *action)
  */
 static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
 {
+again:
 	chip_bus_lock(irq, desc);
 	raw_spin_lock_irq(&desc->lock);
+
+	/*
+	 * Implausible though it may be we need to protect us against
+	 * the following scenario:
+	 *
+	 * The thread is faster done than the hard interrupt handler
+	 * on the other CPU. If we unmask the irq line then the
+	 * interrupt can come in again and masks the line, leaves due
+	 * to IRQ_INPROGRESS and the irq line is masked forever.
+	 */
+	if (unlikely(desc->status & IRQ_INPROGRESS)) {
+		raw_spin_unlock_irq(&desc->lock);
+		chip_bus_sync_unlock(irq, desc);
+		cpu_relax();
+		goto again;
+	}
+
 	if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
 		desc->status &= ~IRQ_MASKED;
 		desc->chip->unmask(irq);
-- 
cgit v1.2.3


From 220b140b52ab6cc133f674a7ffec8fa792054f25 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 10 Mar 2010 20:45:52 +1100
Subject: perf_event: Fix oops triggered by cpu offline/online

Anton Blanchard found that he could reliably make the kernel hit a
BUG_ON in the slab allocator by taking a cpu offline and then online
while a system-wide perf record session was running.

The reason is that when the cpu comes up, we completely reinitialize
the ctx field of the struct perf_cpu_context for the cpu.  If there is
a system-wide perf record session running, then there will be a struct
perf_event that has a reference to the context, so its refcount will
be 2.  (The perf_event has been removed from the context's group_entry
and event_entry lists by perf_event_exit_cpu(), but that doesn't
remove the perf_event's reference to the context and doesn't decrement
the context's refcount.)

When the cpu comes up, perf_event_init_cpu() gets called, and it calls
__perf_event_init_context() on the cpu's context.  That resets the
refcount to 1.  Then when the perf record session finishes and the
perf_event is closed, the refcount gets decremented to 0 and the
context gets kfreed after an RCU grace period.  Since the context
wasn't kmalloced -- it's part of a per-cpu variable -- bad things
happen.

In fact we don't need to completely reinitialize the context when the
cpu comes up.  It's sufficient to initialize the context once at boot,
but we need to do it for all possible cpus.

This moves the context initialization to happen at boot time.  With
this, we don't trash the refcount and the context never gets kfreed,
and we don't hit the BUG_ON.

Reported-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Tested-by: Anton Blanchard <anton@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index c502b18594cc..fb3031cf9f17 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -5368,12 +5368,22 @@ int perf_event_init_task(struct task_struct *child)
 	return ret;
 }
 
+static void __init perf_event_init_all_cpus(void)
+{
+	int cpu;
+	struct perf_cpu_context *cpuctx;
+
+	for_each_possible_cpu(cpu) {
+		cpuctx = &per_cpu(perf_cpu_context, cpu);
+		__perf_event_init_context(&cpuctx->ctx, NULL);
+	}
+}
+
 static void __cpuinit perf_event_init_cpu(int cpu)
 {
 	struct perf_cpu_context *cpuctx;
 
 	cpuctx = &per_cpu(perf_cpu_context, cpu);
-	__perf_event_init_context(&cpuctx->ctx, NULL);
 
 	spin_lock(&perf_resource_lock);
 	cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
@@ -5439,6 +5449,7 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = {
 
 void __init perf_event_init(void)
 {
+	perf_event_init_all_cpus();
 	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
 			(void *)(long)smp_processor_id());
 	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
-- 
cgit v1.2.3


From 83ff56f46a8532488ee364bb93a9cb2a59490d33 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 9 Mar 2010 10:22:19 -0500
Subject: kprobes: Calculate the index correctly when freeing the out-of-line
 execution slot

From : Ananth N Mavinakayanahalli <ananth@in.ibm.com>

When freeing the instruction slot, the arithmetic to calculate
the index of the slot in the page needs to account for the total
size of the instruction on the various architectures.

Calculate the index correctly when freeing the out-of-line
execution slot.

Reported-by: Sachin Sant <sachinp@in.ibm.com>
Reported-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
LKML-Reference: <4B9667AB.9050507@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/kprobes.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index fa034d29cf73..0ed46f3e51e9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -259,7 +259,8 @@ static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
 	struct kprobe_insn_page *kip;
 
 	list_for_each_entry(kip, &c->pages, list) {
-		long idx = ((long)slot - (long)kip->insns) / c->insn_size;
+		long idx = ((long)slot - (long)kip->insns) /
+				(c->insn_size * sizeof(kprobe_opcode_t));
 		if (idx >= 0 && idx < slots_per_page(c)) {
 			WARN_ON(kip->slot_used[idx] != SLOT_USED);
 			if (dirty) {
-- 
cgit v1.2.3


From 639fe4b12f92b54c9c3b38c82cdafaa38cfd3e63 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Thu, 11 Mar 2010 15:30:35 +0800
Subject: perf: export perf_trace_regs and perf_arch_fetch_caller_regs

Export perf_trace_regs and perf_arch_fetch_caller_regs since module will
use these.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
[ use EXPORT_PER_CPU_SYMBOL_GPL() ]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4B989C1B.2090407@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_event_perf.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index f315b12a41d8..0709e4f75114 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -10,6 +10,7 @@
 #include "trace.h"
 
 DEFINE_PER_CPU(struct pt_regs, perf_trace_regs);
+EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs);
 
 static char *perf_trace_buf;
 static char *perf_trace_buf_nmi;
-- 
cgit v1.2.3


From 80a05b9ffa7dc13f6693902dd8999a2b61a3a0d7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 12 Mar 2010 17:34:14 +0100
Subject: clockevents: Sanitize min_delta_ns adjustment and prevent overflows

The current logic which handles clock events programming failures can
increase min_delta_ns unlimited and even can cause overflows.

Sanitize it by:
 - prevent zero increase when min_delta_ns == 1
 - limiting min_delta_ns to a jiffie
 - bail out if the jiffie limit is hit
 - add retries stats for /proc/timer_list so we can gather data

Reported-by: Uwe Kleine-Koenig <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-oneshot.c | 52 +++++++++++++++++++++++++++++++++++-----------
 kernel/time/timer_list.c   |  3 ++-
 2 files changed, 42 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 0a8a213016f0..aada0e52680a 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -22,6 +22,29 @@
 
 #include "tick-internal.h"
 
+/* Limit min_delta to a jiffie */
+#define MIN_DELTA_LIMIT		(NSEC_PER_SEC / HZ)
+
+static int tick_increase_min_delta(struct clock_event_device *dev)
+{
+	/* Nothing to do if we already reached the limit */
+	if (dev->min_delta_ns >= MIN_DELTA_LIMIT)
+		return -ETIME;
+
+	if (dev->min_delta_ns < 5000)
+		dev->min_delta_ns = 5000;
+	else
+		dev->min_delta_ns += dev->min_delta_ns >> 1;
+
+	if (dev->min_delta_ns > MIN_DELTA_LIMIT)
+		dev->min_delta_ns = MIN_DELTA_LIMIT;
+
+	printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
+	       dev->name ? dev->name : "?",
+	       (unsigned long long) dev->min_delta_ns);
+	return 0;
+}
+
 /**
  * tick_program_event internal worker function
  */
@@ -37,23 +60,28 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
 		if (!ret || !force)
 			return ret;
 
+		dev->retries++;
 		/*
-		 * We tried 2 times to program the device with the given
-		 * min_delta_ns. If that's not working then we double it
+		 * We tried 3 times to program the device with the given
+		 * min_delta_ns. If that's not working then we increase it
 		 * and emit a warning.
 		 */
 		if (++i > 2) {
 			/* Increase the min. delta and try again */
-			if (!dev->min_delta_ns)
-				dev->min_delta_ns = 5000;
-			else
-				dev->min_delta_ns += dev->min_delta_ns >> 1;
-
-			printk(KERN_WARNING
-			       "CE: %s increasing min_delta_ns to %llu nsec\n",
-			       dev->name ? dev->name : "?",
-			       (unsigned long long) dev->min_delta_ns << 1);
-
+			if (tick_increase_min_delta(dev)) {
+				/*
+				 * Get out of the loop if min_delta_ns
+				 * hit the limit already. That's
+				 * better than staying here forever.
+				 *
+				 * We clear next_event so we have a
+				 * chance that the box survives.
+				 */
+				printk(KERN_WARNING
+				       "CE: Reprogramming failure. Giving up\n");
+				dev->next_event.tv64 = KTIME_MAX;
+				return -ETIME;
+			}
 			i = 0;
 		}
 
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index bdfb8dd1050c..1a4a7dd78777 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -228,6 +228,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
 	SEQ_printf(m, " event_handler:  ");
 	print_name_offset(m, dev->event_handler);
 	SEQ_printf(m, "\n");
+	SEQ_printf(m, " retries:        %lu\n", dev->retries);
 }
 
 static void timer_list_show_tickdevices(struct seq_file *m)
@@ -257,7 +258,7 @@ static int timer_list_show(struct seq_file *m, void *v)
 	u64 now = ktime_to_ns(ktime_get());
 	int cpu;
 
-	SEQ_printf(m, "Timer List Version: v0.5\n");
+	SEQ_printf(m, "Timer List Version: v0.6\n");
 	SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
 	SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
 
-- 
cgit v1.2.3


From 829b6c1ef488856c6a46a2f705f5068062d5f34c Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 11 Mar 2010 14:04:30 -0800
Subject: timer stats: Fix del_timer_sync() and try_to_del_timer_sync()

These functions forgot to run timer_stats_timer_clear_start_info().  It's
unobvious what effect this has and whether it matters much - we won't be
printing it out anyway if the timer's detached.

Untested, just an Ingo trollpatch.

[ Nevertheless correct - tglx ]

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: johnstul@us.ibm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/timer.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index c61a7949387f..fc965eae0e87 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -880,6 +880,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
 	if (base->running_timer == timer)
 		goto out;
 
+	timer_stats_timer_clear_start_info(timer);
 	ret = 0;
 	if (timer_pending(timer)) {
 		detach_timer(timer, 1);
-- 
cgit v1.2.3


From 15365c108ea27598e265f8c13e7051d99ca5b0b9 Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Thu, 11 Mar 2010 14:04:31 -0800
Subject: posix-cpu-timers: Reset expire cache when no timer is running

When a process deletes cpu timer or a timer expires we do not clear
the expiration cache sig->cputimer_expires.

As a result the fastpath_timer_check() which prevents us to loop over
all threads in case no timer is active is not working and we run the
slow path needlessly on every tick.

Zero sig->cputimer_expires in stop_process_timers().

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Spencer Candland <spencer@bluehost.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/posix-cpu-timers.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 438ff4523513..edec25a68e9e 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1060,9 +1060,9 @@ static void check_thread_timers(struct task_struct *tsk,
 	}
 }
 
-static void stop_process_timers(struct task_struct *tsk)
+static void stop_process_timers(struct signal_struct *sig)
 {
-	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+	struct thread_group_cputimer *cputimer = &sig->cputimer;
 	unsigned long flags;
 
 	if (!cputimer->running)
@@ -1071,6 +1071,10 @@ static void stop_process_timers(struct task_struct *tsk)
 	spin_lock_irqsave(&cputimer->lock, flags);
 	cputimer->running = 0;
 	spin_unlock_irqrestore(&cputimer->lock, flags);
+
+	sig->cputime_expires.prof_exp = cputime_zero;
+	sig->cputime_expires.virt_exp = cputime_zero;
+	sig->cputime_expires.sched_exp = 0;
 }
 
 static u32 onecputick;
@@ -1131,7 +1135,7 @@ static void check_process_timers(struct task_struct *tsk,
 	    list_empty(&timers[CPUCLOCK_VIRT]) &&
 	    cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
 	    list_empty(&timers[CPUCLOCK_SCHED])) {
-		stop_process_timers(tsk);
+		stop_process_timers(sig);
 		return;
 	}
 
-- 
cgit v1.2.3


From cd3d8031eb4311e516329aee03c79a08333141f1 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Fri, 12 Mar 2010 16:15:36 +0900
Subject: sched: sched_getaffinity(): Allow less than NR_CPUS length

[ Note, this commit changes the syscall ABI for > 1024 CPUs systems. ]

Recently, some distro decided to use NR_CPUS=4096 for mysterious reasons.
Unfortunately, glibc sched interface has the following definition:

	# define __CPU_SETSIZE  1024
	# define __NCPUBITS     (8 * sizeof (__cpu_mask))
	typedef unsigned long int __cpu_mask;
	typedef struct
	{
	  __cpu_mask __bits[__CPU_SETSIZE / __NCPUBITS];
	} cpu_set_t;

It mean, if NR_CPUS is bigger than 1024, cpu_set_t makes an
ABI issue ...

More recently, Sharyathi Nagesh reported following test program makes
misterious syscall failure:

 -----------------------------------------------------------------------
 #define _GNU_SOURCE
 #include<stdio.h>
 #include<errno.h>
 #include<sched.h>

 int main()
 {
     cpu_set_t set;
     if (sched_getaffinity(0, sizeof(cpu_set_t), &set) < 0)
         printf("\n Call is failing with:%d", errno);
 }
 -----------------------------------------------------------------------

Because the kernel assumes len argument of sched_getaffinity() is bigger
than NR_CPUS. But now it is not correct.

Now we are faced with the following annoying dilemma, due to
the limitations of the glibc interface built in years ago:

 (1) if we change glibc's __CPU_SETSIZE definition, we lost
     binary compatibility of _all_ application.

 (2) if we don't change it, we also lost binary compatibility of
     Sharyathi's use case.

Then, I would propse to change the rule of the len argument of
sched_getaffinity().

Old:
	len should be bigger than NR_CPUS
New:
	len should be bigger than maximum possible cpu id

This creates the following behavior:

 (A) In the real 4096 cpus machine, the above test program still
     return -EINVAL.

 (B) NR_CPUS=4096 but the machine have less than 1024 cpus (almost
     all machines in the world), the above can run successfully.

Fortunatelly, BIG SGI machine is mainly used for HPC use case. It means
they can rebuild their programs.

IOW we hope they are not annoyed by this issue ...

Reported-by: Sharyathi Nagesh <sharyath@in.ibm.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Ulrich Drepper <drepper@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Russ Anderson <rja@sgi.com>
Cc: Mike Travis <travis@sgi.com>
LKML-Reference: <20100312161316.9520.A69D9226@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 9ab3cd7858d3..6eaef3df2d19 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4902,7 +4902,9 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
 	int ret;
 	cpumask_var_t mask;
 
-	if (len < cpumask_size())
+	if (len < nr_cpu_ids)
+		return -EINVAL;
+	if (len & (sizeof(unsigned long)-1))
 		return -EINVAL;
 
 	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
@@ -4910,10 +4912,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
 
 	ret = sched_getaffinity(pid, mask);
 	if (ret == 0) {
-		if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
+		int retlen = min(len, cpumask_size());
+
+		if (copy_to_user(user_mask_ptr, mask, retlen))
 			ret = -EFAULT;
 		else
-			ret = cpumask_size();
+			ret = retlen;
 	}
 	free_cpumask_var(mask);
 
-- 
cgit v1.2.3


From e3818b8dce2a934cd1521dbc4827e5238d8f45d8 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 15 Mar 2010 17:03:43 -0700
Subject: rcu: Make rcu_read_lock_bh_held() allow for disabled BH

Disabling BH can stand in for rcu_read_lock_bh(), and this patch
updates rcu_read_lock_bh_held() to allow for this.  In order to
avoid include-file hell, this function is moved out of line to
kernel/rcupdate.c.

This fixes a false positive RCU warning.

Reported-by: Arnd Bergmann <arnd@arndb.de>
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <20100316000343.GA25857@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupdate.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f1125c1a6321..63fe25433980 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -45,6 +45,7 @@
 #include <linux/mutex.h>
 #include <linux/module.h>
 #include <linux/kernel_stat.h>
+#include <linux/hardirq.h>
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static struct lock_class_key rcu_lock_key;
@@ -66,6 +67,28 @@ EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
 int rcu_scheduler_active __read_mostly;
 EXPORT_SYMBOL_GPL(rcu_scheduler_active);
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+/**
+ * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section?
+ *
+ * Check for bottom half being disabled, which covers both the
+ * CONFIG_PROVE_RCU and not cases.  Note that if someone uses
+ * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
+ * will show the situation.
+ *
+ * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
+ */
+int rcu_read_lock_bh_held(void)
+{
+	if (!debug_lockdep_rcu_enabled())
+		return 1;
+	return in_softirq();
+}
+EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
+
+#end