/* SPDX-License-Identifier: GPL-2.0 */
/*
* linux/arch/x86_64/entry.S
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
* Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
*
* entry.S contains the system-call and fault low-level handling routines.
*
* Some of this is documented in Documentation/x86/entry_64.rst
*
* A note on terminology:
* - iret frame: Architecture defined interrupt frame from SS to RIP
* at the top of the kernel process stack.
*
* Some macro usage:
* - SYM_FUNC_START/END:Define functions in the symbol table.
* - idtentry: Define exception entry points.
*/
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/unistd.h>
#include <asm/thread_info.h>
#include <asm/hw_irq.h>
#include <asm/page_types.h>
#include <asm/irqflags.h>
#include <asm/paravirt.h>
#include <asm/percpu.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <asm/export.h>
#include <asm/frame.h>
#include <asm/trapnr.h>
#include <asm/nospec-branch.h>
#include <asm/fsgsbase.h>
#include <linux/err.h>
#include "calling.h"
.code64
.section .entry.text, "ax"
/*
* 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
*
* This is the only entry point used for 64-bit system calls. The
* hardware interface is reasonably well designed and the register to
* argument mapping Linux uses fits well with the registers that are
* available when SYSCALL is used.
*
* SYSCALL instructions can be found inlined in libc implementations as
* well as some other programs and libraries. There are also a handful
* of SYSCALL instructions in the vDSO used, for example, as a
* clock_gettimeofday fallback.
*
* 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
* then loads new ss, cs, and rip from previously programmed MSRs.
* rflags gets masked by a value from another MSR (so CLD and CLAC
* are not needed). SYSCALL does not save anything on the stack
* and does not change rsp.
*
* Registers on entry:
* rax system call number
* rcx return address
* r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
* rdi arg0
* rsi arg1
* rdx arg2
* r10 arg3 (needs to be moved to rcx to conform to C ABI)
* r8 arg4
* r9 arg5
* (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
*
* Only called from user space.
*
* When user can change pt_regs->foo always force IRET. That is because
* it deals with uncanonical addresses better. SYSRET has trouble
* with them due to bugs in both AMD and Intel CPUs.
*/
SYM_CODE_START(entry_SYSCALL_64)
UNWIND_HINT_ENTRY
ENDBR
swapgs
/* tss.sp2 is scratch space. */
movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
/* Construct struct pt_regs on stack */
pushq $__USER_DS /* pt_regs->ss */
pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
pushq %rax /* pt_regs->orig_ax */
PUSH_AND_CLEAR_REGS rax=$-ENOSYS
/* IRQs are off. */
movq %rsp, %rdi
/* Sign extend the lower 32bit as syscall numbers are treated as int */
movslq %eax, %rsi
/* clobbers %rax, make sure it is after saving the syscall nr */
IBRS_ENTER
UNTRAIN_RET
call do_syscall_64 /* returns with IRQs disabled */
/*
* Try to use SYSRET instead of IRET if we're returning to
* a completely clean 64-bit userspace context. If we're not,
* go to the slow exit path.
* In the Xen PV case we must use iret anyway.
*/
ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \
X86_FEATURE_XENPV
movq RCX(%rsp), %rcx
movq RIP(%rsp), %r11
cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */
jne swapgs_restore_regs_and_return_to_usermode
/*
* On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
* in kernel space. This essentially lets the user take over
* the kernel, since userspace controls RSP.
*
* If width of "canonical tail" ever becomes variable, this will need
* to be updated to remain correct on both old and new CPUs.
*
* Change top bits to match most significant bit (47th or 56th bit
* depending on paging mode) in the address.
*/
#ifdef CONFIG_X86_5LEVEL
ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
"shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
#else
shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
#endif
/* If this changed %rcx, it was not canonical */
cmpq %rcx, %r11
jne swapgs_restore_regs_and_return_to_usermode
cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */
jne swapgs_restore_regs_and_return_to_usermode
movq R11(%rsp), %r11
cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */
jne swapgs_restore_regs_and_return_to_usermode
/*
* SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
* restore RF properly. If the slowpath sets it for whatever reason, we
* need to restore it correctly.
*
* SYSRET can restore TF, but unlike IRET, restoring TF results in a
* trap from userspace immediately after SYSRET. This would cause an
* infinite loop whenever #DB happens with register state that satisfies
* the opportunistic SYSRET conditions. For example, single-stepping
* this user code:
*
* movq $stuck_here, %rcx
* pushfq
* popq %r11
* stuck_here:
*
* would never get past 'stuck_here'.
*/
testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
jnz swapgs_restore_regs_and_return_to_usermode
/* nothing to check for RSP */
cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */
jne swapgs_restore_regs_and_return_to_usermode
/*
* We win! This label is here just for ease of understanding
* perf profiles. Nothing jumps here.
*/
syscall_return_via_sysret:
IBRS_EXIT
POP_REGS pop_rdi=0
/*
* Now all regs are restored except RSP and RDI.
* Save old stack pointer and switch to trampoline stack.
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
UNWIND_HINT_EMPTY
pushq RSP-RDI(%rdi) /* RSP */
pushq (%rdi) /* RDI */
/*
* We are on the trampoline stack. All regs except RDI are live.
* We can do future final exit work right here.
*/
STACKLEAK_ERASE_NOCLOBBER
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
popq %rdi
popq %rsp
SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
swapgs
sysretq
SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
int3
SYM_CODE_END(entry_SYSCALL_64)
/*
* %rdi:
|