diff options
Diffstat (limited to 'arch/x86')
71 files changed, 1179 insertions, 1153 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b3a1a5d77d92..798658048db3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -133,7 +133,7 @@ config X86 select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_SYSCALL_TRACEPOINTS - select HAVE_UID16 if X86_32 + select HAVE_UID16 if X86_32 || IA32_EMULATION select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_USER_RETURN_NOTIFIER select IRQ_FORCED_THREADING @@ -1002,19 +1002,41 @@ config X86_THERMAL_VECTOR def_bool y depends on X86_MCE_INTEL -config VM86 - bool "Enable VM86 support" if EXPERT - default y +config X86_LEGACY_VM86 + bool "Legacy VM86 support (obsolete)" + default n depends on X86_32 ---help--- - This option is required by programs like DOSEMU to run - 16-bit real mode legacy code on x86 processors. It also may - be needed by software like XFree86 to initialize some video - cards via BIOS. Disabling this option saves about 6K. + This option allows user programs to put the CPU into V8086 + mode, which is an 80286-era approximation of 16-bit real mode. + + Some very old versions of X and/or vbetool require this option + for user mode setting. Similarly, DOSEMU will use it if + available to accelerate real mode DOS programs. However, any + recent version of DOSEMU, X, or vbetool should be fully + functional even without kernel VM86 support, as they will all + fall back to (pretty well performing) software emulation. + + Anything that works on a 64-bit kernel is unlikely to need + this option, as 64-bit kernels don't, and can't, support V8086 + mode. This option is also unrelated to 16-bit protected mode + and is not needed to run most 16-bit programs under Wine. + + Enabling this option adds considerable attack surface to the + kernel and slows down system calls and exception handling. + + Unless you use very old userspace or need the last drop of + performance in your real mode DOS games and can't use KVM, + say N here. + +config VM86 + bool + default X86_LEGACY_VM86 config X86_16BIT bool "Enable support for 16-bit segments" if EXPERT default y + depends on MODIFY_LDT_SYSCALL ---help--- This option is required by programs like Wine to run 16-bit protected mode legacy code on x86 processors. Disabling @@ -1509,6 +1531,7 @@ config X86_RESERVE_LOW config MATH_EMULATION bool + depends on MODIFY_LDT_SYSCALL prompt "Math emulation" if X86_32 ---help--- Linux can emulate a math coprocessor (used for floating point @@ -2053,6 +2076,22 @@ config CMDLINE_OVERRIDE This is used to work around broken boot loaders. This should be set to 'N' under normal conditions. +config MODIFY_LDT_SYSCALL + bool "Enable the LDT (local descriptor table)" if EXPERT + default y + ---help--- + Linux can allow user programs to install a per-process x86 + Local Descriptor Table (LDT) using the modify_ldt(2) system + call. This is required to run 16-bit or segmented code such as + DOSEMU or some Wine programs. It is also used by some very old + threading libraries. + + Enabling this feature adds a small amount of overhead to + context switches and increases the low-level kernel attack + surface. Disabling it removes the modify_ldt(2) system call. + + Saying 'N' here may make sense for embedded or server kernels. + source "kernel/livepatch/Kconfig" endmenu @@ -2522,7 +2561,7 @@ config IA32_EMULATION depends on X86_64 select BINFMT_ELF select COMPAT_BINFMT_ELF - select HAVE_UID16 + select ARCH_WANT_OLD_COMPAT_IPC ---help--- Include code to run legacy 32-bit programs under a 64-bit kernel. You should likely turn this on, unless you're @@ -2536,7 +2575,7 @@ config IA32_AOUT config X86_X32 bool "x32 ABI for 64-bit mode" - depends on X86_64 && IA32_EMULATION + depends on X86_64 ---help--- Include code to run binaries for the x32 native 32-bit ABI for 64-bit processors. An x32 process gets access to the @@ -2550,7 +2589,6 @@ config X86_X32 config COMPAT def_bool y depends on IA32_EMULATION || X86_X32 - select ARCH_WANT_OLD_COMPAT_IPC if COMPAT config COMPAT_FOR_U64_ALIGNMENT diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 118e6debc483..054ff969fcdb 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -39,6 +39,16 @@ ifdef CONFIG_X86_NEED_RELOCS LDFLAGS_vmlinux := --emit-relocs endif +# +# Prevent GCC from generating any FP code by mistake. +# +# This must happen before we try the -mpreferred-stack-boundary, see: +# +# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 +# +KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow +KBUILD_CFLAGS += $(call cc-option,-mno-avx,) + ifeq ($(CONFIG_X86_32),y) BITS := 32 UTS_MACHINE := i386 @@ -167,9 +177,6 @@ KBUILD_CFLAGS += -pipe KBUILD_CFLAGS += -Wno-sign-compare # KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -# prevent gcc from generating any FP code by mistake -KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -KBUILD_CFLAGS += $(call cc-option,-mno-avx,) KBUILD_CFLAGS += $(mflags-y) KBUILD_AFLAGS += $(mflags-y) diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index d7b1f655b3ef..6a9b96b4624d 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -82,7 +82,7 @@ static unsigned long get_random_long(void) if (has_cpuflag(X86_FEATURE_TSC)) { debug_putstr(" RDTSC"); - rdtscll(raw); + raw = rdtsc(); random ^= raw; use_i8254 = false; diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 7a144971db79..bd55dedd7614 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -2,6 +2,7 @@ # Makefile for the x86 low level entry code # obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o +obj-y += common.o obj-y += vdso/ obj-y += vsyscall/ diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index f4e6308c4200..3c71dd947c7b 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -135,9 +135,6 @@ For 32-bit we have the following conventions - kernel is built with movq %rbp, 4*8+\offset(%rsp) movq %rbx, 5*8+\offset(%rsp) .endm - .macro SAVE_EXTRA_REGS_RBP offset=0 - movq %rbp, 4*8+\offset(%rsp) - .endm .macro RESTORE_EXTRA_REGS offset=0 movq 0*8+\offset(%rsp), %r15 @@ -193,12 +190,6 @@ For 32-bit we have the following conventions - kernel is built with .macro RESTORE_C_REGS_EXCEPT_RCX_R11 RESTORE_C_REGS_HELPER 1,0,0,1,1 .endm - .macro RESTORE_RSI_RDI - RESTORE_C_REGS_HELPER 0,0,0,0,0 - .endm - .macro RESTORE_RSI_RDI_RDX - RESTORE_C_REGS_HELPER 0,0,0,0,1 - .endm .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 subq $-(15*8+\addskip), %rsp diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c new file mode 100644 index 000000000000..a3e9c7fa15d9 --- /dev/null +++ b/arch/x86/entry/common.c @@ -0,0 +1,375 @@ +/* + * common.c - C code for kernel entry and exit + * Copyright (c) 2015 Andrew Lutomirski + * GPL v2 + * + * Based on asm and ptrace code by many authors. The code here originated + * in ptrace.c and signal.c. + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/errno.h> +#include <linux/ptrace.h> +#include <linux/tracehook.h> +#include <linux/audit.h> +#include <linux/seccomp.h> +#include <linux/signal.h> +#include <linux/export.h> +#include <linux/context_tracking.h> +#include <linux/user-return-notifier.h> +#include <linux/uprobes.h> + +#include <asm/desc.h> +#include <asm/traps.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/syscalls.h> + +#ifdef CONFIG_CONTEXT_TRACKING +/* Called on entry from user mode with IRQs off. */ +__visible void enter_from_user_mode(void) +{ + CT_WARN_ON(ct_state() != CONTEXT_USER); + user_exit(); +} +#endif + +static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) +{ +#ifdef CONFIG_X86_64 + if (arch == AUDIT_ARCH_X86_64) { + audit_syscall_entry(regs->orig_ax, regs->di, + regs->si, regs->dx, regs->r10); + } else +#endif + { + audit_syscall_entry(regs->orig_ax, regs->bx, + regs->cx, regs->dx, regs->si); + } +} + +/* + * We can return 0 to resume the syscall or anything else to go to phase + * 2. If we resume the syscall, we need to put something appropriate in + * regs->orig_ax. + * + * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax + * are fully functional. + * + * For phase 2's benefit, our return value is: + * 0: resume the syscall + * 1: go to phase 2; no seccomp phase 2 needed + * anything else: go to phase 2; pass return value to seccomp + */ +unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) +{ + unsigned long ret = 0; + u32 work; + + BUG_ON(regs != task_pt_regs(current)); + + work = ACCESS_ONCE(current_thread_info()->flags) & + _TIF_WORK_SYSCALL_ENTRY; + +#ifdef CONFIG_CONTEXT_TRACKING + /* + * If TIF_NOHZ is set, we are required to call user_exit() before + * doing anything that could touch RCU. + */ + if (work & _TIF_NOHZ) { + enter_from_user_mode(); + work &= ~_TIF_NOHZ; + } +#endif + +#ifdef CONFIG_SECCOMP + /* + * Do seccomp first -- it should minimize exposure of other + * code, and keeping seccomp fast is probably more valuable + * than the rest of this. + */ + if (work & _TIF_SECCOMP) { + struct seccomp_data sd; + + sd.arch = arch; + sd.nr = regs->orig_ax; + sd.instruction_pointer = regs->ip; +#ifdef CONFIG_X86_64 + if (arch == AUDIT_ARCH_X86_64) { + sd.args[0] = regs->di; + sd.args[1] = regs->si; + sd.args[2] = regs->dx; + sd.args[3] = regs->r10; + sd.args[4] = regs->r8; + sd.args[5] = regs->r9; + } else +#endif + { + sd.args[0] = regs->bx; + sd.args[1] = regs->cx; + sd.args[2] = regs->dx; + sd.args[3] = regs->si; + sd.args[4] = regs->di; + sd.args[5] = regs->bp; + } + + BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0); + BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1); + + ret = seccomp_phase1(&sd); + if (ret == SECCOMP_PHASE1_SKIP) { + regs->orig_ax = -1; + ret = 0; + } else if (ret != SECCOMP_PHASE1_OK) { + return ret; /* Go directly to phase 2 */ + } + + work &= ~_TIF_SECCOMP; + } +#endif + + /* Do our best to finish without phase 2. */ + if (work == 0) + return ret; /* seccomp and/or nohz only (ret == 0 here) */ + +#ifdef CONFIG_AUDITSYSCALL + if (work == _TIF_SYSCALL_AUDIT) { + /* + * If there is no more work to be done except auditing, + * then audit in phase 1. Phase 2 always audits, so, if + * we audit here, then we can't go on to phase 2. + */ + do_audit_syscall_entry(regs, arch); + return 0; + } +#endif + + return 1; /* Something is enabled that we can't handle in phase 1 */ +} + +/* Returns the syscall nr to run (which should match regs->orig_ax). */ +long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, + unsigned long phase1_result) +{ + long ret = 0; + u32 work = ACCESS_ONCE(current_thread_info()->flags) & + _TIF_WORK_SYSCALL_ENTRY; + + BUG_ON(regs != task_pt_regs(current)); + + /* + * If we stepped into a sysenter/syscall insn, it trapped in + * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. + * If user-mode had set TF itself, then it's still clear from + * do_debug() and we need to set it again to restore the user + * state. If we entered on the slow path, TF was already set. + */ + if (work & _TIF_SINGLESTEP) + regs->flags |= X86_EFLAGS_TF; + +#ifdef CONFIG_SECCOMP + /* + * Call seccomp_phase2 before running the other hooks so that + * they can see any changes made by a seccomp tracer. + */ + if (phase1_result > 1 && seccomp_phase2(phase1_result)) { + /* seccomp failures shouldn't expose any additional code. */ + return -1; + } +#endif + + if (unlikely(work & _TIF_SYSCALL_EMU)) + ret = -1L; + + if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) && + tracehook_report_syscall_entry(regs)) + ret = -1L; + + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_enter(regs, regs->orig_ax); + + do_audit_syscall_entry(regs, arch); + + return ret ?: regs->orig_ax; +} + +long syscall_trace_enter(struct pt_regs *regs) +{ + u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; + unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch); + + if (phase1_result == 0) + return regs->orig_ax; + else + return syscall_trace_enter_phase2(regs, arch, phase1_result); +} + +/* Deprecated. */ +void syscall_trace_leave(struct pt_regs *regs) +{ + bool step; + + /* + * We may come here right after calling schedule_user() + * or do_notify_resume(), in which case we can be in RCU + * user mode. + */ + user_exit(); + + audit_syscall_exit(regs); + + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_exit(regs, regs->ax); + + /* + * If TIF_SYSCALL_EMU is set, we only get here because of + * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). + * We already reported this syscall instruction in + * syscall_trace_enter(). + */ + step = unlikely(test_thread_flag(TIF_SINGLESTEP)) && + !test_thread_flag(TIF_SYSCALL_EMU); + if (step || test_thread_flag(TIF_SYSCALL_TRACE)) + tracehook_report_syscall_exit(regs, step); + + user_enter(); +} + +static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs) +{ + unsigned long top_of_stack = + (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING; + return (struct thread_info *)(top_of_stack - THREAD_SIZE); +} + +/* Called with IRQs disabled. */ +__visible void prepare_exit_to_usermode(struct pt_regs *regs) +{ + if (WARN_ON(!irqs_disabled())) + local_irq_disable(); + + /* + * In order to return to user mode, we need to have IRQs off with + * none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY, + * _TIF_UPROBE, or _TIF_NEED_RESCHED set. Several of these flags + * can be set at any time on preemptable kernels if we have IRQs on, + * so we need to loop. Disabling preemption wouldn't help: doing the + * work to clear some of the flags can sleep. + */ + while (true) { + u32 cached_flags = + READ_ONCE(pt_regs_to_thread_info(regs)->flags); + + if (!(cached_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | + _TIF_UPROBE | _TIF_NEED_RESCHED | + _TIF_USER_RETURN_NOTIFY))) + break; + + /* We have work to do. */ + local_irq_enable(); + + if (cached_flags & _TIF_NEED_RESCHED) + schedule(); + + if (cached_flags & _TIF_UPROBE) + uprobe_notify_resume(regs); + + /* deal with pending signal delivery */ |