From 2fbe7b25c8edaf2d10e6c1a4cc9f8afe714c4764 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Tue, 26 Sep 2006 10:52:27 +0200
Subject: [PATCH] i386/x86-64: Remove un/set_nmi_callback and
 reserve/release_lapic_nmi functions

Removes the un/set_nmi_callback and reserve/release_lapic_nmi functions as
they are no longer needed.  The various subsystems are modified to register
with the die_notifier instead.

Also includes compile fixes by Andrew Morton.

Signed-off-by:  Don Zickus <dzickus@redhat.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 kernel/sysctl.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 362a0cc37138..83f168361624 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -76,8 +76,6 @@ extern int compat_log;
 
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 int unknown_nmi_panic;
-extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *,
-				  void __user *, size_t *, loff_t *);
 #endif
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
@@ -628,7 +626,7 @@ static ctl_table kern_table[] = {
 		.data           = &unknown_nmi_panic,
 		.maxlen         = sizeof (int),
 		.mode           = 0644,
-		.proc_handler   = &proc_unknown_nmi_panic,
+		.proc_handler   = &proc_dointvec,
 	},
 #endif
 #if defined(CONFIG_X86)
-- 
cgit v1.2.3


From 407984f1af259b31957c7c05075a454a751bb801 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Tue, 26 Sep 2006 10:52:27 +0200
Subject: [PATCH] x86: Add ability to enable/disable nmi watchdog with sysctl

Adds a new /proc/sys/kernel/nmi_watchdog entry that will enable/disable the
nmi watchdog.

Signed-off-by:  Don Zickus <dzickus@redhat.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 kernel/sysctl.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 83f168361624..040de6bd74dd 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -76,6 +76,9 @@ extern int compat_log;
 
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 int unknown_nmi_panic;
+int nmi_watchdog_enabled;
+extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
+			void __user *, size_t *, loff_t *);
 #endif
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
@@ -628,6 +631,14 @@ static ctl_table kern_table[] = {
 		.mode           = 0644,
 		.proc_handler   = &proc_dointvec,
 	},
+	{
+		.ctl_name       = KERN_NMI_WATCHDOG,
+		.procname       = "nmi_watchdog",
+		.data           = &nmi_watchdog_enabled,
+		.maxlen         = sizeof (int),
+		.mode           = 0644,
+		.proc_handler   = &proc_nmi_enabled,
+	},
 #endif
 #if defined(CONFIG_X86)
 	{
-- 
cgit v1.2.3


From 8da5adda91df3d2fcc5300e68da491694c9af019 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Tue, 26 Sep 2006 10:52:27 +0200
Subject: [PATCH] x86: Allow users to force a panic on NMI

To quote Alan Cox:

The default Linux behaviour on an NMI of either memory or unknown is to
continue operation. For many environments such as scientific computing
it is preferable that the box is taken out and the error dealt with than
an uncorrected parity/ECC error get propagated.

A small number of systems do generate NMI's for bizarre random reasons
such as power management so the default is unchanged. In other respects
the new proc/sys entry works like the existing panic controls already in
that directory.

This is separate to the edac support - EDAC allows supported chipsets to
handle ECC errors well, this change allows unsupported cases to at least
panic rather than cause problems further down the line.

Signed-off-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 kernel/panic.c  | 1 +
 kernel/sysctl.c | 8 ++++++++
 2 files changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 8010b9b17aca..d2db3e2209e0 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -21,6 +21,7 @@
 #include <linux/debug_locks.h>
 
 int panic_on_oops;
+int panic_on_unrecovered_nmi;
 int tainted;
 static int pause_on_oops;
 static int pause_on_oops_flag;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 040de6bd74dd..220e20564124 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -641,6 +641,14 @@ static ctl_table kern_table[] = {
 	},
 #endif
 #if defined(CONFIG_X86)
+	{
+		.ctl_name	= KERN_PANIC_ON_NMI,
+		.procname	= "panic_on_unrecovered_nmi",
+		.data		= &panic_on_unrecovered_nmi,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{
 		.ctl_name	= KERN_BOOTLOADER_TYPE,
 		.procname	= "bootloader_type",
-- 
cgit v1.2.3


From 3cfc348bf90ffaa777c188652aa297f04eb94de8 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 26 Sep 2006 10:52:28 +0200
Subject: [PATCH] x86: Add portable getcpu call

For NUMA optimization and some other algorithms it is useful to have a fast
way to get the current CPU and node numbers in user space.

x86-64 added a fast way to do this in a vsyscall. This adds a generic
syscall for other architectures to make it a generic portable facility.

I expect some of them will also implement it as a faster vsyscall.

The cache is an optimization for the x86-64 vsyscall optimization. Since
what the syscall returns is an approximation anyways and user space
often wants very fast results it can be cached for some time.  The normal
methods to get this information in user space are relatively slow.

The vsyscall is in a better position to manage the cache because it has direct
access to a fast time stamp (jiffies). For the generic syscall the cache
doesn't help much, but the syscall still reads and writes the cache argument
so that programs pass something valid and stay portable.

I only added an i386 syscall entry for now. Other architectures can follow
as needed.

AK: Also added some cleanups from Andrew Morton

Signed-off-by: Andi Kleen <ak@suse.de>
---
 kernel/sys.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index e236f98f7ec5..3f894775488d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -28,6 +28,7 @@
 #include <linux/tty.h>
 #include <linux/signal.h>
 #include <linux/cn_proc.h>
+#include <linux/getcpu.h>
 
 #include <linux/compat.h>
 #include <linux/syscalls.h>
@@ -2062,3 +2063,33 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 	}
 	return error;
 }
+
+asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep,
+	   		   struct getcpu_cache __user *cache)
+{
+	int err = 0;
+	int cpu = raw_smp_processor_id();
+	if (cpup)
+		err |= put_user(cpu, cpup);
+	if (nodep)
+		err |= put_user(cpu_to_node(cpu), nodep);
+	if (cache) {
+		/*
+		 * The cache is not needed for this implementation,
+		 * but make sure user programs pass something
+		 * valid. vsyscall implementations can instead make
+		 * good use of the cache. Only use t0 and t1 because
+		 * these are available in both 32bit and 64bit ABI (no
+		 * need for a compat_getcpu). 32bit has enough
+		 * padding
+		 */
+		unsigned long t0, t1;
+		get_user(t0, &cache->t0);
+		get_user(t1, &cache->t1);
+		t0++;
+		t1++;
+		put_user(t0, &cache->t0);
+		put_user(t1, &cache->t1);
+	}
+	return err ? -EFAULT : 0;
+}
-- 
cgit v1.2.3


From 0cb91a2293648507886563ccb91979cfc94d6a4b Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 26 Sep 2006 10:52:28 +0200
Subject: [PATCH] i386: Account spinlocks to the caller during profiling for
 !FP kernels

This ports the algorithm from x86-64 (with improvements) to i386.
Previously this only worked for frame pointer enabled kernels.
But spinlocks have a very simple stack frame that can be manually
analyzed. Do this.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 kernel/spinlock.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index fb524b009eef..9644a41e0bef 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -7,6 +7,11 @@
  *
  * This file contains the spinlock/rwlock implementations for the
  * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them)
+ *
+ * Note that some architectures have special knowledge about the
+ * stack frames of these functions in their profile_pc. If you
+ * change anything significant here that could change the stack
+ * frame contact the architecture maintainers.
  */
 
 #include <linux/linkage.h>
-- 
cgit v1.2.3


From 5a1b3999d6cb7ab87f1f3b1700bc91839fd6fa29 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 26 Sep 2006 10:52:34 +0200
Subject: [PATCH] x86: Some preparatory cleanup for stack trace

- Remove unused all_contexts parameter
No caller used it
- Move skip argument into the structure (needed for
follow-on patches)

Cc: mingo@elte.hu

Signed-off-by: Andi Kleen <ak@suse.de>
---
 kernel/lockdep.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 9bad17884513..900b4cb1a024 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -224,7 +224,10 @@ static int save_trace(struct stack_trace *trace)
 	trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
 	trace->entries = stack_trace + nr_stack_trace_entries;
 
-	save_stack_trace(trace, NULL, 0, 3);
+	trace->skip = 3;
+	trace->all_contexts = 0;
+
+	save_stack_trace(trace, NULL);
 
 	trace->max_entries = trace->nr_entries;
 
-- 
cgit v1.2.3


From 3fa7c794fe4dc127f7fac3fad4d13628e68f89ce Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 26 Sep 2006 10:52:34 +0200
Subject: [PATCH] Avoid recursion in lockdep when stack tracer takes locks

The new dwarf2 unwinder needs to take locks to do backtraces
inside modules. This patch makes sure lockdep which calls
stacktrace is not reentered.

Thanks to Ingo for suggesting this simpler approach.

Cc: mingo@elte.hu
Signed-off-by: Andi Kleen <ak@suse.de>
---
 kernel/lockdep.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 900b4cb1a024..c088e5542e84 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -227,7 +227,11 @@ static int save_trace(struct stack_trace *trace)
 	trace->skip = 3;
 	trace->all_contexts = 0;
 
+	/* Make sure to not recurse in case the unwinder needs to take
+	   locks. */
+	lockdep_off();
 	save_stack_trace(trace, NULL);
+	lockdep_on();
 
 	trace->max_entries = trace->nr_entries;
 
-- 
cgit v1.2.3


From 0a4254058037eb172758961d0a5b94f4320a1425 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Tue, 26 Sep 2006 10:52:38 +0200
Subject: [PATCH] Add the canary field to the PDA area and the task struct

This patch adds the per thread cookie field to the task struct and the PDA.
Also it makes sure that the PDA value gets the new cookie value at context
switch, and that a new task gets a new cookie at task creation time.

Signed-off-by: Arjan van Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andi Kleen <ak@suse.de>
CC: Andi Kleen <ak@suse.de>
---
 kernel/fork.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index f9b014e3e700..a0dad84567c9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -45,6 +45,7 @@
 #include <linux/cn_proc.h>
 #include <linux/delayacct.h>
 #include <linux/taskstats_kern.h>
+#include <linux/random.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -175,6 +176,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	tsk->thread_info = ti;
 	setup_thread_stack(tsk, orig);
 
+#ifdef CONFIG_CC_STACKPROTECTOR
+	tsk->stack_canary = get_random_int();
+#endif
+
 	/* One for us, one for whoever does the "release_task()" (usually parent) */
 	atomic_set(&tsk->usage,2);
 	atomic_set(&tsk->fs_excl, 0);
-- 
cgit v1.2.3


From 3162f751d04086a9d006342de63ac8f44fe0f72a Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Tue, 26 Sep 2006 10:52:39 +0200
Subject: [PATCH] Add the __stack_chk_fail() function

GCC emits a call to a __stack_chk_fail() function when the stack canary is
not matching the expected value.

Since this is a bad security issue, let's panic the kernel rather than limping
along; the kernel really can't be trusted anymore when this happens.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andi Kleen <ak@suse.de>
CC: Andi Kleen <ak@suse.de>
---
 kernel/panic.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index d2db3e2209e0..6ceb664fb52a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -271,3 +271,15 @@ void oops_exit(void)
 {
 	do_oops_enter_exit();
 }
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+/*
+ * Called when gcc's -fstack-protector feature is used, and
+ * gcc detects corruption of the on-stack canary value
+ */
+void __stack_chk_fail(void)
+{
+	panic("stack-protector: Kernel stack is corrupted");
+}
+EXPORT_SYMBOL(__stack_chk_fail);
+#endif
-- 
cgit v1.2.3


From adf1423698f00d00b267f7dca8231340ce7d65ef Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Tue, 26 Sep 2006 10:52:41 +0200
Subject: [PATCH] i386/x86-64: Work around gcc bug with noreturn functions in
 unwinder

Current gcc generates calls not jumps to noreturn functions. When that happens the
return address can point to the next function, which confuses the unwinder.

This patch works around it by marking asynchronous exception
frames in contrast to normal call frames in the unwind information.  Then teach
the unwinder to decode this.

For normal call frames the unwinder now subtracts one from the address which avoids
this problem.  The standard libgcc unwinder uses the same trick.

It doesn't include adjustment of the printed address (i.e. for the original
example, it'd still be kernel_math_error+0 that gets displayed, but the
unwinder wouldn't get confused anymore).

This only works with binutils 2.6.17+ and some versions of H.J.Lu's 2.6.16
unfortunately because earlier binutils don't support .cfi_signal_frame

[AK: added automatic detection of the new binutils and wrote description]

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 kernel/unwind.c | 35 ++++++++++++++++++++++++++++-------
 1 file changed, 28 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/unwind.c b/kernel/unwind.c
index f69c804c8e62..3430475fcd88 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -603,6 +603,7 @@ int unwind(struct unwind_frame_info *frame)
 #define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs])
 	const u32 *fde = NULL, *cie = NULL;
 	const u8 *ptr = NULL, *end = NULL;
+	unsigned long pc = UNW_PC(frame) - frame->call_frame;
 	unsigned long startLoc = 0, endLoc = 0, cfa;
 	unsigned i;
 	signed ptrType = -1;
@@ -612,7 +613,7 @@ int unwind(struct unwind_frame_info *frame)
 
 	if (UNW_PC(frame) == 0)
 		return -EINVAL;
-	if ((table = find_table(UNW_PC(frame))) != NULL
+	if ((table = find_table(pc)) != NULL
 	    && !(table->size & (sizeof(*fde) - 1))) {
 		unsigned long tableSize = table->size;
 
@@ -647,7 +648,7 @@ int unwind(struct unwind_frame_info *frame)
 			                        ptrType & DW_EH_PE_indirect
 			                        ? ptrType
 			                        : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed));
-			if (UNW_PC(frame) >= startLoc && UNW_PC(frame) < endLoc)
+			if (pc >= startLoc && pc < endLoc)
 				break;
 			cie = NULL;
 		}
@@ -657,16 +658,28 @@ int unwind(struct unwind_frame_info *frame)
 		state.cieEnd = ptr; /* keep here temporarily */
 		ptr = (const u8 *)(cie + 2);
 		end = (const u8 *)(cie + 1) + *cie;
+		frame->call_frame = 1;
 		if ((state.version = *ptr) != 1)
 			cie = NULL; /* unsupported version */
 		else if (*++ptr) {
 			/* check if augmentation size is first (and thus present) */
 			if (*ptr == 'z') {
-				/* check for ignorable (or already handled)
-				 * nul-terminated augmentation string */
-				while (++ptr < end && *ptr)
-					if (strchr("LPR", *ptr) == NULL)
+				while (++ptr < end && *ptr) {
+					switch(*ptr) {
+					/* check for ignorable (or already handled)
+					 * nul-terminated augmentation string */
+					case 'L':
+					case 'P':
+					case 'R':
+						continue;
+					case 'S':
+						frame->call_frame = 0;
+						continue;
+					default:
 						break;
+					}
+					break;
+				}
 			}
 			if (ptr >= end || *ptr)
 				cie = NULL;
@@ -755,7 +768,7 @@ int unwind(struct unwind_frame_info *frame)
 	state.org = startLoc;
 	memcpy(&state.cfa, &badCFA, sizeof(state.cfa));
 	/* process instructions */
-	if (!processCFI(ptr, end, UNW_PC(frame), ptrType, &state)
+	if (!processCFI(ptr, end, pc, ptrType, &state)
 	   || state.loc > endLoc
 	   || state.regs[retAddrReg].where == Nowhere
 	   || state.cfa.reg >= ARRAY_SIZE(reg_info)
@@ -763,6 +776,11 @@ int unwind(struct unwind_frame_info *frame)
 	   || state.cfa.offs % sizeof(unsigned long))
 		return -EIO;
 	/* update frame */
+#ifndef CONFIG_AS_CFI_SIGNAL_FRAME
+	if(frame->call_frame
+	   && !UNW_DEFAULT_RA(state.regs[retAddrReg], state.dataAlign))
+		frame->call_frame = 0;
+#endif
 	cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs;
 	startLoc = min((unsigned long)UNW_SP(frame), cfa);
 	endLoc = max((unsigned long)UNW_SP(frame), cfa);
@@ -866,6 +884,7 @@ int unwind_init_frame_info(struct unwind_frame_info *info,
                            /*const*/ struct pt_regs *regs)
 {
 	info->task = tsk;
+	info->call_frame = 0;
 	arch_unw_init_frame_info(info, regs);
 
 	return 0;
@@ -879,6 +898,7 @@ int unwind_init_blocked(struct unwind_frame_info *info,
                         struct task_struct *tsk)
 {
 	info->task = tsk;
+	info->call_frame = 0;
 	arch_unw_init_blocked(info);
 
 	return 0;
@@ -894,6 +914,7 @@ int unwind_init_running(struct unwind_frame_info *info,
                         void *arg)
 {
 	info->task = current;
+	info->call_frame = 0;
 
 	return arch_unwind_init_running(info, callback, arg);
 }
-- 
cgit v1.2.3