diff options
author | Paolo Bonzini <pbonzini@redhat.com> | 2024-05-10 10:49:02 -0400 |
---|---|---|
committer | Paolo Bonzini <pbonzini@redhat.com> | 2024-05-10 13:20:18 -0400 |
commit | 4232da23d75d173195c6766729e51947b64f83cd (patch) | |
tree | 7eb1420b118d452c0b497bf6267ecdfb0a6d5931 /arch/x86 | |
parent | bbe10a5cc0c775e52e91c8b3b6547b59b2054f44 (diff) | |
parent | 7b7e584f90bf670d5c6f2b1fff884bf3b972cad4 (diff) | |
download | linux-4232da23d75d173195c6766729e51947b64f83cd.tar.gz linux-4232da23d75d173195c6766729e51947b64f83cd.tar.bz2 linux-4232da23d75d173195c6766729e51947b64f83cd.zip |
Merge tag 'loongarch-kvm-6.10' of git://git.kernel.org/pub/scm/linux/kernel/git/chenhuacai/linux-loongson into HEAD
LoongArch KVM changes for v6.10
1. Add ParaVirt IPI support.
2. Add software breakpoint support.
3. Add mmio trace events support.
Diffstat (limited to 'arch/x86')
61 files changed, 778 insertions, 386 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4fff6ed46e90..928820e61cb5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -62,6 +62,7 @@ config X86 select ACPI_HOTPLUG_CPU if ACPI_PROCESSOR && HOTPLUG_CPU select ARCH_32BIT_OFF_T if X86_32 select ARCH_CLOCKSOURCE_INIT + select ARCH_CONFIGURES_CPU_MITIGATIONS select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 @@ -2488,17 +2489,21 @@ config PREFIX_SYMBOLS def_bool y depends on CALL_PADDING && !CFI_CLANG -menuconfig SPECULATION_MITIGATIONS - bool "Mitigations for speculative execution vulnerabilities" +menuconfig CPU_MITIGATIONS + bool "Mitigations for CPU vulnerabilities" default y help - Say Y here to enable options which enable mitigations for - speculative execution hardware vulnerabilities. + Say Y here to enable options which enable mitigations for hardware + vulnerabilities (usually related to speculative execution). + Mitigations can be disabled or restricted to SMT systems at runtime + via the "mitigations" kernel parameter. - If you say N, all mitigations will be disabled. You really - should know what you are doing to say so. + If you say N, all mitigations will be disabled. This CANNOT be + overridden at runtime. -if SPECULATION_MITIGATIONS + Say 'Y', unless you really know what you are doing. + +if CPU_MITIGATIONS config MITIGATION_PAGE_TABLE_ISOLATION bool "Remove the kernel mapping in user mode" @@ -2633,6 +2638,16 @@ config MITIGATION_RFDS stored in floating point, vector and integer registers. See also <file:Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst> +config MITIGATION_SPECTRE_BHI + bool "Mitigate Spectre-BHB (Branch History Injection)" + depends on CPU_SUP_INTEL + default y + help + Enable BHI mitigations. BHI attacks are a form of Spectre V2 attacks + where the branch history buffer is poisoned to speculatively steer + indirect branches. + See <file:Documentation/admin-guide/hw-vuln/spectre.rst> + endif config ARCH_HAS_ADD_PAGES diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 6356060caaf3..51cc9c7cb9bd 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -49,7 +49,7 @@ static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr) if (likely(unr < NR_syscalls)) { unr = array_index_nospec(unr, NR_syscalls); - regs->ax = sys_call_table[unr](regs); + regs->ax = x64_sys_call(regs, unr); return true; } return false; @@ -66,7 +66,7 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) { xnr = array_index_nospec(xnr, X32_NR_syscalls); - regs->ax = x32_sys_call_table[xnr](regs); + regs->ax = x32_sys_call(regs, xnr); return true; } return false; @@ -162,7 +162,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr) if (likely(unr < IA32_NR_syscalls)) { unr = array_index_nospec(unr, IA32_NR_syscalls); - regs->ax = ia32_sys_call_table[unr](regs); + regs->ax = ia32_sys_call(regs, unr); } else if (nr != -1) { regs->ax = __ia32_sys_ni_syscall(regs); } @@ -189,7 +189,7 @@ static __always_inline bool int80_is_external(void) } /** - * int80_emulation - 32-bit legacy syscall entry + * do_int80_emulation - 32-bit legacy syscall C entry from asm * * This entry point can be used by 32-bit and 64-bit programs to perform * 32-bit system calls. Instances of INT $0x80 can be found inline in @@ -207,7 +207,7 @@ static __always_inline bool int80_is_external(void) * eax: system call number * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6 */ -DEFINE_IDTENTRY_RAW(int80_emulation) +__visible noinstr void do_int80_emulation(struct pt_regs *regs) { int nr; @@ -255,6 +255,71 @@ DEFINE_IDTENTRY_RAW(int80_emulation) instrumentation_end(); syscall_exit_to_user_mode(regs); } + +#ifdef CONFIG_X86_FRED +/* + * A FRED-specific INT80 handler is warranted for the follwing reasons: + * + * 1) As INT instructions and hardware interrupts are separate event + * types, FRED does not preclude the use of vector 0x80 for external + * interrupts. As a result, the FRED setup code does not reserve + * vector 0x80 and calling int80_is_external() is not merely + * suboptimal but actively incorrect: it could cause a system call + * to be incorrectly ignored. + * + * 2) It is called only for handling vector 0x80 of event type + * EVENT_TYPE_SWINT and will never be called to handle any external + * interrupt (event type EVENT_TYPE_EXTINT). + * + * 3) FRED has separate entry flows depending on if the event came from + * user space or kernel space, and because the kernel does not use + * INT insns, the FRED kernel entry handler fred_entry_from_kernel() + * falls through to fred_bad_type() if the event type is + * EVENT_TYPE_SWINT, i.e., INT insns. So if the kernel is handling + * an INT insn, it can only be from a user level. + * + * 4) int80_emulation() does a CLEAR_BRANCH_HISTORY. While FRED will + * likely take a different approach if it is ever needed: it + * probably belongs in either fred_intx()/ fred_other() or + * asm_fred_entrypoint_user(), depending on if this ought to be done + * for all entries from userspace or only system + * calls. + * + * 5) INT $0x80 is the fast path for 32-bit system calls under FRED. + */ +DEFINE_FREDENTRY_RAW(int80_emulation) +{ + int nr; + + enter_from_user_mode(regs); + + instrumentation_begin(); + add_random_kstack_offset(); + + /* + * FRED pushed 0 into regs::orig_ax and regs::ax contains the + * syscall number. + * + * User tracing code (ptrace or signal handlers) might assume + * that the regs::orig_ax contains a 32-bit number on invoking + * a 32-bit syscall. + * + * Establish the syscall convention by saving the 32bit truncated + * syscall number in regs::orig_ax and by invalidating regs::ax. + */ + regs->orig_ax = regs->ax & GENMASK(31, 0); + regs->ax = -ENOSYS; + + nr = syscall_32_enter(regs); + + local_irq_enable(); + nr = syscall_enter_from_user_mode_work(regs, nr); + do_syscall_32_irqs_on(regs, nr); + + instrumentation_end(); + syscall_exit_to_user_mode(regs); +} +#endif #else /* CONFIG_IA32_EMULATION */ /* Handles int $0x80 on a 32bit kernel */ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 8af2a26b24f6..1b5be07f8669 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -116,6 +116,7 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) /* clobbers %rax, make sure it is after saving the syscall nr */ IBRS_ENTER UNTRAIN_RET + CLEAR_BRANCH_HISTORY call do_syscall_64 /* returns with IRQs disabled */ @@ -1491,3 +1492,63 @@ SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead) call make_task_dead SYM_CODE_END(rewind_stack_and_make_dead) .popsection + +/* + * This sequence executes branches in order to remove user branch information + * from the branch history tracker in the Branch Predictor, therefore removing + * user influence on subsequent BTB lookups. + * + * It should be used on parts prior to Alder Lake. Newer parts should use the + * BHI_DIS_S hardware control instead. If a pre-Alder Lake part is being + * virtualized on newer hardware the VMM should protect against BHI attacks by + * setting BHI_DIS_S for the guests. + * + * CALLs/RETs are necessary to prevent Loop Stream Detector(LSD) from engaging + * and not clearing the branch history. The call tree looks like: + * + * call 1 + * call 2 + * call 2 + * call 2 + * call 2 + * call 2 + * ret + * ret + * ret + * ret + * ret + * ret + * + * This means that the stack is non-constant and ORC can't unwind it with %rsp + * alone. Therefore we unconditionally set up the frame pointer, which allows + * ORC to unwind properly. + * + * The alignment is for performance and not for safety, and may be safely + * refactored in the future if needed. + */ +SYM_FUNC_START(clear_bhb_loop) + push %rbp + mov %rsp, %rbp + movl $5, %ecx + ANNOTATE_INTRA_FUNCTION_CALL + call 1f + jmp 5f + .align 64, 0xcc + ANNOTATE_INTRA_FUNCTION_CALL +1: call 2f + RET + .align 64, 0xcc +2: movl $5, %eax +3: jmp 4f + nop +4: sub $1, %eax + jnz 3b + sub $1, %ecx + jnz 1b + RET +5: lfence + pop %rbp + RET +SYM_FUNC_END(clear_bhb_loop) +EXPORT_SYMBOL_GPL(clear_bhb_loop) +STACK_FRAME_NON_STANDARD(clear_bhb_loop) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index eabf48c4d4b4..c779046cc3fe 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -92,6 +92,7 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) IBRS_ENTER UNTRAIN_RET + CLEAR_BRANCH_HISTORY /* * SYSENTER doesn't filter flags, so we need to clear NT and AC @@ -206,6 +207,7 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL) IBRS_ENTER UNTRAIN_RET + CLEAR_BRANCH_HISTORY movq %rsp, %rdi call do_fast_syscall_32 @@ -276,3 +278,17 @@ SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR int3 SYM_CODE_END(entry_SYSCALL_compat) + +/* + * int 0x80 is used by 32 bit mode as a system call entry. Normally idt entries + * point to C routines, however since this is a system call interface the branch + * history needs to be scrubbed to protect against BHI attacks, and that + * scrubbing needs to take place in assembly code prior to entering any C + * routines. + */ +SYM_CODE_START(int80_emulation) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + CLEAR_BRANCH_HISTORY + jmp do_int80_emulation +SYM_CODE_END(int80_emulation) diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c index ac120cbdaaf2..89c1476fcdd9 100644 --- a/arch/x86/entry/entry_fred.c +++ b/arch/x86/entry/entry_fred.c @@ -28,9 +28,9 @@ static noinstr void fred_bad_type(struct pt_regs *regs, unsigned long error_code if (regs->fred_cs.sl > 0) { pr_emerg("PANIC: invalid or fatal FRED event; event type %u " "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n", - regs->fred_ss.type, regs->fred_ss.vector, regs->orig_ax, + regs->fred_ss.type, regs->fred_ss.vector, error_code, fred_event_data(regs), regs->cs, regs->ip); - die("invalid or fatal FRED event", regs, regs->orig_ax); + die("invalid or fatal FRED event", regs, error_code); panic("invalid or fatal FRED event"); } else { unsigned long flags = oops_begin(); @@ -38,10 +38,10 @@ static noinstr void fred_bad_type(struct pt_regs *regs, unsigned long error_code pr_alert("BUG: invalid or fatal FRED event; event type %u " "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n", - regs->fred_ss.type, regs->fred_ss.vector, regs->orig_ax, + regs->fred_ss.type, regs->fred_ss.vector, error_code, fred_event_data(regs), regs->cs, regs->ip); - if (__die("Invalid or fatal FRED event", regs, regs->orig_ax)) + if (__die("Invalid or fatal FRED event", regs, error_code)) sig = 0; oops_end(flags, regs, sig); @@ -66,7 +66,7 @@ static noinstr void fred_intx(struct pt_regs *regs) /* INT80 */ case IA32_SYSCALL_VECTOR: if (ia32_enabled()) - return int80_emulation(regs); + return fred_int80_emulation(regs); fallthrough; #endif diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 8cfc9bc73e7f..c2235bae17ef 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -18,8 +18,25 @@ #include <asm/syscalls_32.h> #undef __SYSCALL +/* + * The sys_call_table[] is no longer used for system calls, but + * kernel/trace/trace_syscalls.c still wants to know the system + * call address. + */ +#ifdef CONFIG_X86_32 #define __SYSCALL(nr, sym) __ia32_##sym, - -__visible const sys_call_ptr_t ia32_sys_call_table[] = { +const sys_call_ptr_t sys_call_table[] = { #include <asm/syscalls_32.h> }; +#undef __SYSCALL +#endif + +#define __SYSCALL(nr, sym) case nr: return __ia32_##sym(regs); + +long ia32_sys_call(const struct pt_regs *regs, unsigned int nr) +{ + switch (nr) { + #include <asm/syscalls_32.h> + default: return __ia32_sys_ni_syscall(regs); + } +}; diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index be120eec1fc9..33b3f09e6f15 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -11,8 +11,23 @@ #include <asm/syscalls_64.h> #undef __SYSCALL +/* + * The sys_call_table[] is no longer used for system calls, but + * kernel/trace/trace_syscalls.c still wants to know the system + * call address. + */ #define __SYSCALL(nr, sym) __x64_##sym, - -asmlinkage const sys_call_ptr_t sys_call_table[] = { +const sys_call_ptr_t sys_call_table[] = { #include <asm/syscalls_64.h> }; +#undef __SYSCALL + +#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs); + +long x64_sys_call(const struct pt_regs *regs, unsigned int nr) +{ + switch (nr) { + #include <asm/syscalls_64.h> + default: return __x64_sys_ni_syscall(regs); + } +}; diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c index bdd0e03a1265..03de4a932131 100644 --- a/arch/x86/entry/syscall_x32.c +++ b/arch/x86/entry/syscall_x32.c @@ -11,8 +11,12 @@ #include <asm/syscalls_x32.h> #undef __SYSCALL -#define __SYSCALL(nr, sym) __x64_##sym, +#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs); -asmlinkage const sys_call_ptr_t x32_sys_call_table[] = { -#include <asm/syscalls_x32.h> +long x32_sys_call(const struct pt_regs *regs, unsigned int nr) +{ + switch (nr) { + #include <asm/syscalls_x32.h> + default: return __x64_sys_ni_syscall(regs); + } }; diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index a3c0df11d0e6..2fb7d53cf333 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -98,11 +98,6 @@ static int addr_to_vsyscall_nr(unsigned long addr) static bool write_ok_or_segv(unsigned long ptr, size_t size) { - /* - * XXX: if access_ok, get_user, and put_user handled - * sig_on_uaccess_err, this could go away. - */ - if (!access_ok((void __user *)ptr, size)) { struct thread_struct *thread = ¤t->thread; @@ -120,10 +115,8 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size) bool emulate_vsyscall(unsigned long error_code, struct pt_regs *regs, unsigned long address) { - struct task_struct *tsk; unsigned long caller; int vsyscall_nr, syscall_nr, tmp; - int prev_sig_on_uaccess_err; long ret; unsigned long orig_dx; @@ -172,8 +165,6 @@ bool emulate_vsyscall(unsigned long error_code, goto sigsegv; } - tsk = current; - /* * Check for access_ok violations and find the syscall nr. * @@ -234,12 +225,8 @@ bool emulate_vsyscall(unsigned long error_code, goto do_ret; /* skip requested */ /* - * With a real vsyscall, page faults cause SIGSEGV. We want to - * preserve that behavior to make writing exploits harder. + * With a real vsyscall, page faults cause SIGSEGV. */ - prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err; - current->thread.sig_on_uaccess_err = 1; - ret = -EFAULT; switch (vsyscall_nr) { case 0: @@ -262,23 +249,12 @@ bool emulate_vsyscall(unsigned long error_code, break; } - current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err; - check_fault: if (ret == -EFAULT) { /* Bad news -- userspace fed a bad pointer to a vsyscall. */ warn_bad_vsyscall(KERN_INFO, regs, "vsyscall fault (exploit attempt?)"); - - /* - * If we failed to generate a signal for any reason, - * generate one here. (This should be impossible.) - */ - if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && - !sigismember(&tsk->pending.signal, SIGSEGV))) - goto sigsegv; - - return true; /* Don't emulate the ret. */ + goto sigsegv; } regs->ax = ret; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 09050641ce5d..5b0dd07b1ef1 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1644,6 +1644,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) while (++i < cpuc->n_events) { cpuc->event_list[i-1] = cpuc->event_list[i]; cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; + cpuc->assign[i-1] = cpuc->assign[i]; } cpuc->event_constraint[i-1] = NULL; --cpuc->n_events; diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 78cd5084104e..4367aa77cb8d 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1693,6 +1693,7 @@ void x86_perf_get_lbr(struct x86_pmu_lbr *lbr) lbr->from = x86_pmu.lbr_from; lbr->to = x86_pmu.lbr_to; lbr->info = x86_pmu.lbr_info; + lbr->has_callstack = x86_pmu_has_lbr_callstack(); } EXPORT_SYMBOL_GPL(x86_perf_get_lbr); diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 5fc45543e955..0569f579338b 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -105,7 +105,7 @@ static bool cpu_is_self(int cpu) * IPI implementation on Hyper-V. */ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, - bool exclude_self) + bool exclude_self) { struct hv_send_ipi_ex *ipi_arg; unsigned long flags; @@ -132,8 +132,8 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, if (!cpumask_equal(mask, cpu_present_mask) || exclude_self) { ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K; - nr_bank = cpumask_to_vpset_skip(&(ipi_arg->vp_set), mask, - exclude_self ? cpu_is_self : NULL); + nr_bank = cpumask_to_vpset_skip(&ipi_arg->vp_set, mask, + exclude_self ? cpu_is_self : NULL); /* * 'nr_bank <= 0' means some CPUs in cpumask can't be @@ -147,7 +147,7 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, } status = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank, - ipi_arg, NULL); + ipi_arg, NULL); ipi_mask_ex_done: local_irq_restore(flags); @@ -155,7 +155,7 @@ ipi_mask_ex_done: } static bool __send_ipi_mask(const struct cpumask *mask, int vector, - bool exclude_self) + bool exclude_self) { |