From 59bd54a84d15e9335de5b8abe7b3b9713a36b99b Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Wed, 6 Apr 2022 02:29:10 +0300 Subject: x86/tdx: Detect running as a TDX guest in early boot In preparation of extending cc_platform_has() API to support TDX guest, use CPUID instruction to detect support for TDX guests in the early boot code (via tdx_early_init()). Since copy_bootdata() is the first user of cc_platform_has() API, detect the TDX guest status before it. Define a synthetic feature flag (X86_FEATURE_TDX_GUEST) and set this bit in a valid TDX guest platform. Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Tony Luck Reviewed-by: Dave Hansen Reviewed-by: Borislav Petkov Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-2-kirill.shutemov@linux.intel.com --- arch/x86/Kconfig | 12 ++++++++++++ arch/x86/coco/Makefile | 2 ++ arch/x86/coco/tdx/Makefile | 3 +++ arch/x86/coco/tdx/tdx.c | 22 ++++++++++++++++++++++ arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/disabled-features.h | 8 +++++++- arch/x86/include/asm/tdx.h | 21 +++++++++++++++++++++ arch/x86/kernel/head64.c | 4 ++++ 8 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 arch/x86/coco/tdx/Makefile create mode 100644 arch/x86/coco/tdx/tdx.c create mode 100644 arch/x86/include/asm/tdx.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b0142e01002e..4ae27322869d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -878,6 +878,18 @@ config ACRN_GUEST IOT with small footprint and real-time features. More details can be found in https://projectacrn.org/. +config INTEL_TDX_GUEST + bool "Intel TDX (Trust Domain Extensions) - Guest Support" + depends on X86_64 && CPU_SUP_INTEL + depends on X86_X2APIC + help + Support running as a guest under Intel TDX. Without this support, + the guest kernel can not boot or run under TDX. + TDX includes memory encryption and integrity capabilities + which protect the confidentiality and integrity of guest + memory contents and CPU state. TDX guests are protected from + some attacks from the VMM. + endif #HYPERVISOR_GUEST source "arch/x86/Kconfig.cpu" diff --git a/arch/x86/coco/Makefile b/arch/x86/coco/Makefile index c1ead00017a7..c816acf78b6a 100644 --- a/arch/x86/coco/Makefile +++ b/arch/x86/coco/Makefile @@ -4,3 +4,5 @@ KASAN_SANITIZE_core.o := n CFLAGS_core.o += -fno-stack-protector obj-y += core.o + +obj-$(CONFIG_INTEL_TDX_GUEST) += tdx/ diff --git a/arch/x86/coco/tdx/Makefile b/arch/x86/coco/tdx/Makefile new file mode 100644 index 000000000000..c929d53ee059 --- /dev/null +++ b/arch/x86/coco/tdx/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y += tdx.o diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c new file mode 100644 index 000000000000..97674471fd1e --- /dev/null +++ b/arch/x86/coco/tdx/tdx.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2021-2022 Intel Corporation */ + +#undef pr_fmt +#define pr_fmt(fmt) "tdx: " fmt + +#include +#include + +void __init tdx_early_init(void) +{ + u32 eax, sig[3]; + + cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]); + + if (memcmp(TDX_IDENT, sig, sizeof(sig))) + return; + + setup_force_cpu_cap(X86_FEATURE_TDX_GUEST); + + pr_info("Guest detected\n"); +} diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 73e643ae94b6..20df73b51025 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -238,6 +238,7 @@ #define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ #define X86_FEATURE_PVUNLOCK ( 8*32+20) /* "" PV unlock function */ #define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* "" PV vcpu_is_preempted function */ +#define X86_FEATURE_TDX_GUEST ( 8*32+22) /* Intel Trust Domain Extensions Guest */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 1231d63f836d..b37de8268c9a 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -68,6 +68,12 @@ # define DISABLE_SGX (1 << (X86_FEATURE_SGX & 31)) #endif +#ifdef CONFIG_INTEL_TDX_GUEST +# define DISABLE_TDX_GUEST 0 +#else +# define DISABLE_TDX_GUEST (1 << (X86_FEATURE_TDX_GUEST & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -79,7 +85,7 @@ #define DISABLED_MASK5 0 #define DISABLED_MASK6 0 #define DISABLED_MASK7 (DISABLE_PTI) -#define DISABLED_MASK8 0 +#define DISABLED_MASK8 (DISABLE_TDX_GUEST) #define DISABLED_MASK9 (DISABLE_SMAP|DISABLE_SGX) #define DISABLED_MASK10 0 #define DISABLED_MASK11 0 diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h new file mode 100644 index 000000000000..ba8042ce61c2 --- /dev/null +++ b/arch/x86/include/asm/tdx.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2021-2022 Intel Corporation */ +#ifndef _ASM_X86_TDX_H +#define _ASM_X86_TDX_H + +#include + +#define TDX_CPUID_LEAF_ID 0x21 +#define TDX_IDENT "IntelTDX " + +#ifdef CONFIG_INTEL_TDX_GUEST + +void __init tdx_early_init(void); + +#else + +static inline void tdx_early_init(void) { }; + +#endif /* CONFIG_INTEL_TDX_GUEST */ + +#endif /* _ASM_X86_TDX_H */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 4f5ecbbaae77..6dff50c3edd6 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -40,6 +40,7 @@ #include #include #include +#include /* * Manage page tables very early on. @@ -514,6 +515,9 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) idt_setup_early_handler(); + /* Needed before cc_platform_has() can be used for TDX */ + tdx_early_init(); + copy_bootdata(__va(real_mode_data)); /* -- cgit v1.2.3 From 527a534c732604931959e73e9c3a8952d8c1a994 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:11 +0300 Subject: x86/tdx: Provide common base for SEAMCALL and TDCALL C wrappers Secure Arbitration Mode (SEAM) is an extension of VMX architecture. It defines a new VMX root operation (SEAM VMX root) and a new VMX non-root operation (SEAM VMX non-root) which are both isolated from the legacy VMX operation where the host kernel runs. A CPU-attested software module (called 'TDX module') runs in SEAM VMX root to manage and protect VMs running in SEAM VMX non-root. SEAM VMX root is also used to host another CPU-attested software module (called 'P-SEAMLDR') to load and update the TDX module. Host kernel transits to either P-SEAMLDR or TDX module via the new SEAMCALL instruction, which is essentially a VMExit from VMX root mode to SEAM VMX root mode. SEAMCALLs are leaf functions defined by P-SEAMLDR and TDX module around the new SEAMCALL instruction. A guest kernel can also communicate with TDX module via TDCALL instruction. TDCALLs and SEAMCALLs use an ABI different from the x86-64 system-v ABI. RAX is used to carry both the SEAMCALL leaf function number (input) and the completion status (output). Additional GPRs (RCX, RDX, R8-R11) may be further used as both input and output operands in individual leaf. TDCALL and SEAMCALL share the same ABI and require the largely same code to pass down arguments and retrieve results. Define an assembly macro that can be used to implement C wrapper for both TDCALL and SEAMCALL. Suggested-by: Thomas Gleixner Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-3-kirill.shutemov@linux.intel.com --- arch/x86/include/asm/tdx.h | 29 +++++++++++++ arch/x86/kernel/asm-offsets.c | 9 ++++ arch/x86/virt/vmx/tdx/tdxcall.S | 96 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 134 insertions(+) create mode 100644 arch/x86/virt/vmx/tdx/tdxcall.S diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index ba8042ce61c2..cb4c4e607c43 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -4,10 +4,38 @@ #define _ASM_X86_TDX_H #include +#include #define TDX_CPUID_LEAF_ID 0x21 #define TDX_IDENT "IntelTDX " +/* + * SW-defined error codes. + * + * Bits 47:40 == 0xFF indicate Reserved status code class that never used by + * TDX module. + */ +#define TDX_ERROR _BITUL(63) +#define TDX_SW_ERROR (TDX_ERROR | GENMASK_ULL(47, 40)) +#define TDX_SEAMCALL_VMFAILINVALID (TDX_SW_ERROR | _UL(0xFFFF0000)) + +#ifndef __ASSEMBLY__ + +/* + * Used to gather the output registers values of the TDCALL and SEAMCALL + * instructions when requesting services from the TDX module. + * + * This is a software only structure and not part of the TDX module/VMM ABI. + */ +struct tdx_module_output { + u64 rcx; + u64 rdx; + u64 r8; + u64 r9; + u64 r10; + u64 r11; +}; + #ifdef CONFIG_INTEL_TDX_GUEST void __init tdx_early_init(void); @@ -18,4 +46,5 @@ static inline void tdx_early_init(void) { }; #endif /* CONFIG_INTEL_TDX_GUEST */ +#endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_TDX_H */ diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 9fb0a2f8b62a..7dca52f5cfc6 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -18,6 +18,7 @@ #include #include #include +#include #ifdef CONFIG_XEN #include @@ -65,6 +66,14 @@ static void __used common(void) OFFSET(XEN_vcpu_info_arch_cr2, vcpu_info, arch.cr2); #endif + BLANK(); + OFFSET(TDX_MODULE_rcx, tdx_module_output, rcx); + OFFSET(TDX_MODULE_rdx, tdx_module_output, rdx); + OFFSET(TDX_MODULE_r8, tdx_module_output, r8); + OFFSET(TDX_MODULE_r9, tdx_module_output, r9); + OFFSET(TDX_MODULE_r10, tdx_module_output, r10); + OFFSET(TDX_MODULE_r11, tdx_module_output, r11); + BLANK(); OFFSET(BP_scratch, boot_params, scratch); OFFSET(BP_secure_boot, boot_params, secure_boot); diff --git a/arch/x86/virt/vmx/tdx/tdxcall.S b/arch/x86/virt/vmx/tdx/tdxcall.S new file mode 100644 index 000000000000..49a54356ae99 --- /dev/null +++ b/arch/x86/virt/vmx/tdx/tdxcall.S @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include + +/* + * TDCALL and SEAMCALL are supported in Binutils >= 2.36. + */ +#define tdcall .byte 0x66,0x0f,0x01,0xcc +#define seamcall .byte 0x66,0x0f,0x01,0xcf + +/* + * TDX_MODULE_CALL - common helper macro for both + * TDCALL and SEAMCALL instructions. + * + * TDCALL - used by TDX guests to make requests to the + * TDX module and hypercalls to the VMM. + * SEAMCALL - used by TDX hosts to make requests to the + * TDX module. + */ +.macro TDX_MODULE_CALL host:req + /* + * R12 will be used as temporary storage for struct tdx_module_output + * pointer. Since R12-R15 registers are not used by TDCALL/SEAMCALL + * services supported by this function, it can be reused. + */ + + /* Callee saved, so preserve it */ + push %r12 + + /* + * Push output pointer to stack. + * After the operation, it will be fetched into R12 register. + */ + push %r9 + + /* Mangle function call ABI into TDCALL/SEAMCALL ABI: */ + /* Move Leaf ID to RAX */ + mov %rdi, %rax + /* Move input 4 to R9 */ + mov %r8, %r9 + /* Move input 3 to R8 */ + mov %rcx, %r8 + /* Move input 1 to RCX */ + mov %rsi, %rcx + /* Leave input param 2 in RDX */ + + .if \host + seamcall + /* + * SEAMCALL instruction is essentially a VMExit from VMX root + * mode to SEAM VMX root mode. VMfailInvalid (CF=1) indicates + * that the targeted SEAM firmware is not loaded or disabled, + * or P-SEAMLDR is busy with another SEAMCALL. %rax is not + * changed in this case. + * + * Set %rax to TDX_SEAMCALL_VMFAILINVALID for VMfailInvalid. + * This value will never be used as actual SEAMCALL error code as + * it is from the Reserved status code class. + */ + jnc .Lno_vmfailinvalid + mov $TDX_SEAMCALL_VMFAILINVALID, %rax +.Lno_vmfailinvalid: + + .else + tdcall + .endif + + /* + * Fetch output pointer from stack to R12 (It is used + * as temporary storage) + */ + pop %r12 + + /* + * Since this macro can be invoked with NULL as an output pointer, + * check if caller provided an output struct before storing output + * registers. + * + * Update output registers, even if the call failed (RAX != 0). + * Other registers may contain details of the failure. + */ + test %r12, %r12 + jz .Lno_output_struct + + /* Copy result registers to output struct: */ + movq %rcx, TDX_MODULE_rcx(%r12) + movq %rdx, TDX_MODULE_rdx(%r12) + movq %r8, TDX_MODULE_r8(%r12) + movq %r9, TDX_MODULE_r9(%r12) + movq %r10, TDX_MODULE_r10(%r12) + movq %r11, TDX_MODULE_r11(%r12) + +.Lno_output_struct: + /* Restore the state of R12 register */ + pop %r12 +.endm -- cgit v1.2.3 From eb94f1b6a70a683040d60d21bbb6ad65f082600a Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Wed, 6 Apr 2022 02:29:12 +0300 Subject: x86/tdx: Add __tdx_module_call() and __tdx_hypercall() helper functions Guests communicate with VMMs with hypercalls. Historically, these are implemented using instructions that are known to cause VMEXITs like VMCALL, VMLAUNCH, etc. However, with TDX, VMEXITs no longer expose the guest state to the host. This prevents the old hypercall mechanisms from working. So, to communicate with VMM, TDX specification defines a new instruction called TDCALL. In a TDX based VM, since the VMM is an untrusted entity, an intermediary layer -- TDX module -- facilitates secure communication between the host and the guest. TDX module is loaded like a firmware into a special CPU mode called SEAM. TDX guests communicate with the TDX module using the TDCALL instruction. A guest uses TDCALL to communicate with both the TDX module and VMM. The value of the RAX register when executing the TDCALL instruction is used to determine the TDCALL type. A leaf of TDCALL used to communicate with the VMM is called TDVMCALL. Add generic interfaces to communicate with the TDX module and VMM (using the TDCALL instruction). __tdx_module_call() - Used to communicate with the TDX module (via TDCALL instruction). __tdx_hypercall() - Used by the guest to request services from the VMM (via TDVMCALL leaf of TDCALL). Also define an additional wrapper _tdx_hypercall(), which adds error handling support for the TDCALL failure. The __tdx_module_call() and __tdx_hypercall() helper functions are implemented in assembly in a .S file. The TDCALL ABI requires shuffling arguments in and out of registers, which proved to be awkward with inline assembly. Just like syscalls, not all TDVMCALL use cases need to use the same number of argument registers. The implementation here picks the current worst-case scenario for TDCALL (4 registers). For TDCALLs with fewer than 4 arguments, there will end up being a few superfluous (cheap) instructions. But, this approach maximizes code reuse. For registers used by the TDCALL instruction, please check TDX GHCI specification, the section titled "TDCALL instruction" and "TDG.VP.VMCALL Interface". Based on previous patch by Sean Christopherson. Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Tony Luck Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20220405232939.73860-4-kirill.shutemov@linux.intel.com --- arch/x86/coco/tdx/Makefile | 2 +- arch/x86/coco/tdx/tdcall.S | 191 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/coco/tdx/tdx.c | 24 ++++++ arch/x86/include/asm/tdx.h | 30 +++++++ arch/x86/kernel/asm-offsets.c | 8 ++ 5 files changed, 254 insertions(+), 1 deletion(-) create mode 100644 arch/x86/coco/tdx/tdcall.S diff --git a/arch/x86/coco/tdx/Makefile b/arch/x86/coco/tdx/Makefile index c929d53ee059..46c55998557d 100644 --- a/arch/x86/coco/tdx/Makefile +++ b/arch/x86/coco/tdx/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += tdx.o +obj-y += tdx.o tdcall.o diff --git a/arch/x86/coco/tdx/tdcall.S b/arch/x86/coco/tdx/tdcall.S new file mode 100644 index 000000000000..662479ccf630 --- /dev/null +++ b/arch/x86/coco/tdx/tdcall.S @@ -0,0 +1,191 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include +#include + +#include +#include +#include + +#include "../../virt/vmx/tdx/tdxcall.S" + +/* + * Bitmasks of exposed registers (with VMM). + */ +#define TDX_R10 BIT(10) +#define TDX_R11 BIT(11) +#define TDX_R12 BIT(12) +#define TDX_R13 BIT(13) +#define TDX_R14 BIT(14) +#define TDX_R15 BIT(15) + +/* + * These registers are clobbered to hold arguments for each + * TDVMCALL. They are safe to expose to the VMM. + * Each bit in this mask represents a register ID. Bit field + * details can be found in TDX GHCI specification, section + * titled "TDCALL [TDG.VP.VMCALL] leaf". + */ +#define TDVMCALL_EXPOSE_REGS_MASK ( TDX_R10 | TDX_R11 | \ + TDX_R12 | TDX_R13 | \ + TDX_R14 | TDX_R15 ) + +/* + * __tdx_module_call() - Used by TDX guests to request services from + * the TDX module (does not include VMM services) using TDCALL instruction. + * + * Transforms function call register arguments into the TDCALL register ABI. + * After TDCALL operation, TDX module output is saved in @out (if it is + * provided by the user). + * + *------------------------------------------------------------------------- + * TDCALL ABI: + *------------------------------------------------------------------------- + * Input Registers: + * + * RAX - TDCALL Leaf number. + * RCX,RDX,R8-R9 - TDCALL Leaf specific input registers. + * + * Output Registers: + * + * RAX - TDCALL instruction error code. + * RCX,RDX,R8-R11 - TDCALL Leaf specific output registers. + * + *------------------------------------------------------------------------- + * + * __tdx_module_call() function ABI: + * + * @fn (RDI) - TDCALL Leaf ID, moved to RAX + * @rcx (RSI) - Input parameter 1, moved to RCX + * @rdx (RDX) - Input parameter 2, moved to RDX + * @r8 (RCX) - Input parameter 3, moved to R8 + * @r9 (R8) - Input parameter 4, moved to R9 + * + * @out (R9) - struct tdx_module_output pointer + * stored temporarily in R12 (not + * shared with the TDX module). It + * can be NULL. + * + * Return status of TDCALL via RAX. + */ +SYM_FUNC_START(__tdx_module_call) + FRAME_BEGIN + TDX_MODULE_CALL host=0 + FRAME_END + ret +SYM_FUNC_END(__tdx_module_call) + +/* + * __tdx_hypercall() - Make hypercalls to a TDX VMM using TDVMCALL leaf + * of TDCALL instruction + * + * Transforms values in function call argument struct tdx_hypercall_args @args + * into the TDCALL register ABI. After TDCALL operation, VMM output is saved + * back in @args. + * + *------------------------------------------------------------------------- + * TD VMCALL ABI: + *------------------------------------------------------------------------- + * + * Input Registers: + * + * RAX - TDCALL instruction leaf number (0 - TDG.VP.VMCALL) + * RCX - BITMAP which controls which part of TD Guest GPR + * is passed as-is to the VMM and back. + * R10 - Set 0 to indicate TDCALL follows standard TDX ABI + * specification. Non zero value indicates vendor + * specific ABI. + * R11 - VMCALL sub function number + * RBX, RBP, RDI, RSI - Used to pass VMCALL sub function specific arguments. + * R8-R9, R12-R15 - Same as above. + * + * Output Registers: + * + * RAX - TDCALL instruction status (Not related to hypercall + * output). + * R10 - Hypercall output error code. + * R11-R15 - Hypercall sub function specific output values. + * + *------------------------------------------------------------------------- + * + * __tdx_hypercall() function ABI: + * + * @args (RDI) - struct tdx_hypercall_args for input and output + * @flags (RSI) - TDX_HCALL_* flags + * + * On successful completion, return the hypercall error code. + */ +SYM_FUNC_START(__tdx_hypercall) + FRAME_BEGIN + + /* Save callee-saved GPRs as mandated by the x86_64 ABI */ + push %r15 + push %r14 + push %r13 + push %r12 + + /* Mangle function call ABI into TDCALL ABI: */ + /* Set TDCALL leaf ID (TDVMCALL (0)) in RAX */ + xor %eax, %eax + + /* Copy hypercall registers from arg struct: */ + movq TDX_HYPERCALL_r10(%rdi), %r10 + movq TDX_HYPERCALL_r11(%rdi), %r11 + movq TDX_HYPERCALL_r12(%rdi), %r12 + movq TDX_HYPERCALL_r13(%rdi), %r13 + movq TDX_HYPERCALL_r14(%rdi), %r14 + movq TDX_HYPERCALL_r15(%rdi), %r15 + + movl $TDVMCALL_EXPOSE_REGS_MASK, %ecx + + tdcall + + /* + * RAX==0 indicates a failure of the TDVMCALL mechanism itself and that + * something has gone horribly wrong with the TDX module. + * + * The return status of the hypercall operation is in a separate + * register (in R10). Hypercall errors are a part of normal operation + * and are handled by callers. + */ + testq %rax, %rax + jne .Lpanic + + /* TDVMCALL leaf return code is in R10 */ + movq %r10, %rax + + /* Copy hypercall result registers to arg struct if needed */ + testq $TDX_HCALL_HAS_OUTPUT, %rsi + jz .Lout + + movq %r10, TDX_HYPERCALL_r10(%rdi) + movq %r11, TDX_HYPERCALL_r11(%rdi) + movq %r12, TDX_HYPERCALL_r12(%rdi) + movq %r13, TDX_HYPERCALL_r13(%rdi) + movq %r14, TDX_HYPERCALL_r14(%rdi) + movq %r15, TDX_HYPERCALL_r15(%rdi) +.Lout: + /* + * Zero out registers exposed to the VMM to avoid speculative execution + * with VMM-controlled values. This needs to include all registers + * present in TDVMCALL_EXPOSE_REGS_MASK (except R12-R15). R12-R15 + * context will be restored. + */ + xor %r10d, %r10d + xor %r11d, %r11d + + /* Restore callee-saved GPRs as mandated by the x86_64 ABI */ + pop %r12 + pop %r13 + pop %r14 + pop %r15 + + FRAME_END + + retq +.Lpanic: + call __tdx_hypercall_failed + /* __tdx_hypercall_failed never returns */ + jmp .Lpanic +SYM_FUNC_END(__tdx_hypercall) diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 97674471fd1e..4b57880e45b0 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -7,6 +7,30 @@ #include #include +/* + * Wrapper for standard use of __tdx_hypercall with no output aside from + * return code. + */ +static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = fn, + .r12 = r12, + .r13 = r13, + .r14 = r14, + .r15 = r15, + }; + + return __tdx_hypercall(&args, 0); +} + +/* Called from __tdx_hypercall() for unrecoverable failure */ +void __tdx_hypercall_failed(void) +{ + panic("TDVMCALL failed. TDX module bug?"); +} + void __init tdx_early_init(void) { u32 eax, sig[3]; diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index cb4c4e607c43..a33d47abe67d 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -3,12 +3,17 @@ #ifndef _ASM_X86_TDX_H #define _ASM_X86_TDX_H +#include #include #include #define TDX_CPUID_LEAF_ID 0x21 #define TDX_IDENT "IntelTDX " +#define TDX_HYPERCALL_STANDARD 0 + +#define TDX_HCALL_HAS_OUTPUT BIT(0) + /* * SW-defined error codes. * @@ -36,10 +41,35 @@ struct tdx_module_output { u64 r11; }; +/* + * Used in __tdx_hypercall() to pass down and get back registers' values of + * the TDCALL instruction when requesting services from the VMM. + * + * This is a software only structure and not part of the TDX module/VMM ABI. + */ +struct tdx_hypercall_args { + u64 r10; + u64 r11; + u64 r12; + u64 r13; + u64 r14; + u64 r15; +}; + #ifdef CONFIG_INTEL_TDX_GUEST void __init tdx_early_init(void); +/* Used to communicate with the TDX module */ +u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, + struct tdx_module_output *out); + +/* Used to request services from the VMM */ +u64 __tdx_hypercall(struct tdx_hypercall_args *args, unsigned long flags); + +/* Called from __tdx_hypercall() for unrecoverable failure */ +void __tdx_hypercall_failed(void); + #else static inline void tdx_early_init(void) { }; diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 7dca52f5cfc6..437308004ef2 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -74,6 +74,14 @@ static void __used common(void) OFFSET(TDX_MODULE_r10, tdx_module_output, r10); OFFSET(TDX_MODULE_r11, tdx_module_output, r11); + BLANK(); + OFFSET(TDX_HYPERCALL_r10, tdx_hypercall_args, r10); + OFFSET(TDX_HYPERCALL_r11, tdx_hypercall_args, r11); + OFFSET(TDX_HYPERCALL_r12, tdx_hypercall_args, r12); + OFFSET(TDX_HYPERCALL_r13, tdx_hypercall_args, r13); + OFFSET(TDX_HYPERCALL_r14, tdx_hypercall_args, r14); + OFFSET(TDX_HYPERCALL_r15, tdx_hypercall_args, r15); + BLANK(); OFFSET(BP_scratch, boot_params, scratch); OFFSET(BP_secure_boot, boot_params, secure_boot); -- cgit v1.2.3 From 41394e33f3a0ce791caf0e086e1fca850832ddec Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:13 +0300 Subject: x86/tdx: Extend the confidential computing API to support TDX guests Confidential Computing (CC) features (like string I/O unroll support, memory encryption/decryption support, etc) are conditionally enabled in the kernel using cc_platform_has() API. Since TDX guests also need to use these CC features, extend cc_platform_has() API and add TDX guest-specific CC attributes support. CC API also provides an interface to deal with encryption mask. Extend it to cover TDX. Details about which bit in the page table entry to be used to indicate shared/private state is determined by using the TDINFO TDCALL. Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Dave Hansen Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20220405232939.73860-5-kirill.shutemov@linux.intel.com --- arch/x86/Kconfig | 1 + arch/x86/coco/core.c | 12 ++++++++++++ arch/x86/coco/tdx/tdx.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4ae27322869d..984315ca0275 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -882,6 +882,7 @@ config INTEL_TDX_GUEST bool "Intel TDX (Trust Domain Extensions) - Guest Support" depends on X86_64 && CPU_SUP_INTEL depends on X86_X2APIC + select ARCH_HAS_CC_PLATFORM help Support running as a guest under Intel TDX. Without this support, the guest kernel can not boot or run under TDX. diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index fc1365dd927e..3f3008783e05 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -87,9 +87,18 @@ EXPORT_SYMBOL_GPL(cc_platform_has); u64 cc_mkenc(u64 val) { + /* + * Both AMD and Intel use a bit in the page table to indicate + * encryption status of the page. + * + * - for AMD, bit *set* means the page is encrypted + * - for Intel *clear* means encrypted. + */ switch (vendor) { case CC_VENDOR_AMD: return val | cc_mask; + case CC_VENDOR_INTEL: + return val & ~cc_mask; default: return val; } @@ -97,9 +106,12 @@ u64 cc_mkenc(u64 val) u64 cc_mkdec(u64 val) { + /* See comment in cc_mkenc() */ switch (vendor) { case CC_VENDOR_AMD: return val & ~cc_mask; + case CC_VENDOR_INTEL: + return val | cc_mask; default: return val; } diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 4b57880e45b0..96b2611baac5 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -5,8 +5,12 @@ #define pr_fmt(fmt) "tdx: " fmt #include +#include #include +/* TDX module Call Leaf IDs */ +#define TDX_GET_INFO 1 + /* * Wrapper for standard use of __tdx_hypercall with no output aside from * return code. @@ -31,8 +35,47 @@ void __tdx_hypercall_failed(void) panic("TDVMCALL failed. TDX module bug?"); } +/* + * Used for TDX guests to make calls directly to the TD module. This + * should only be used for calls that have no legitimate reason to fail + * or where the kernel can not survive the call failing. + */ +static inline void tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, + struct tdx_module_output *out) +{ + if (__tdx_module_call(fn, rcx, rdx, r8, r9, out)) + panic("TDCALL %lld failed (Buggy TDX module!)\n", fn); +} + +static u64 get_cc_mask(void) +{ + struct tdx_module_output out; + unsigned int gpa_width; + + /* + * TDINFO TDX module call is used to get the TD execution environment + * information like GPA width, number of available vcpus, debug mode + * information, etc. More details about the ABI can be found in TDX + * Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL + * [TDG.VP.INFO]. + * + * The GPA width that comes out of this call is critical. TDX guests + * can not meaningfully run without it. + */ + tdx_module_call(TDX_GET_INFO, 0, 0, 0, 0, &out); + + gpa_width = out.rcx & GENMASK(5, 0); + + /* + * The highest bit of a guest physical address is the "sharing" bit. + * Set it for shared pages and clear it for private pages. + */ + return BIT_ULL(gpa_width - 1); +} + void __init tdx_early_init(void) { + u64 cc_mask; u32 eax, sig[3]; cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]); @@ -42,5 +85,9 @@ void __init tdx_early_init(void) setup_force_cpu_cap(X86_FEATURE_TDX_GUEST); + cc_set_vendor(CC_VENDOR_INTEL); + cc_mask = get_cc_mask(); + cc_set_mask(cc_mask); + pr_info("Guest detected\n"); } -- cgit v1.2.3 From 65fab5bc033aad1a9faf976caec46558c2f88319 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:14 +0300 Subject: x86/tdx: Exclude shared bit from __PHYSICAL_MASK In TDX guests, by default memory is protected from host access. If a guest needs to communicate with the VMM (like the I/O use case), it uses a single bit in the physical address to communicate the protected/shared attribute of the given page. In the x86 ARCH code, __PHYSICAL_MASK macro represents the width of the physical address in the given architecture. It is used in creating physical PAGE_MASK for address bits in the kernel. Since in TDX guest, a single bit is used as metadata, it needs to be excluded from valid physical address bits to avoid using incorrect addresses bits in the kernel. Enable DYNAMIC_PHYSICAL_MASK to support updating the __PHYSICAL_MASK. Co-developed-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Tony Luck Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-6-kirill.shutemov@linux.intel.com --- arch/x86/Kconfig | 1 + arch/x86/coco/tdx/tdx.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 984315ca0275..aea4cc404c31 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -883,6 +883,7 @@ config INTEL_TDX_GUEST depends on X86_64 && CPU_SUP_INTEL depends on X86_X2APIC select ARCH_HAS_CC_PLATFORM + select DYNAMIC_PHYSICAL_MASK help Support running as a guest under Intel TDX. Without this support, the guest kernel can not boot or run under TDX. diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 96b2611baac5..e84f6dd3ed2a 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -89,5 +89,13 @@ void __init tdx_early_init(void) cc_mask = get_cc_mask(); cc_set_mask(cc_mask); + /* + * All bits above GPA width are reserved and kernel treats shared bit + * as flag, not as part of physical address. + * + * Adjust physical mask to only cover valid GPA bits. + */ + physical_mask &= cc_mask - 1; + pr_info("Guest detected\n"); } -- cgit v1.2.3 From 775acc82a88fd36f5e89a08d39874fdeeaa04247 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:15 +0300 Subject: x86/traps: Refactor exc_general_protection() TDX brings a new exception -- Virtualization Exception (#VE). Handling of #VE structurally very similar to handling #GP. Extract two helpers from exc_general_protection() that can be reused for handling #VE. No functional changes. Suggested-by: Thomas Gleixner Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20220405232939.73860-7-kirill.shutemov@linux.intel.com --- arch/x86/kernel/traps.c | 57 +++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1563fb995005..db8d22a0d003 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -686,13 +686,40 @@ static bool try_fixup_enqcmd_gp(void) #endif } +static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr, + unsigned long error_code, const char *str) +{ + if (fixup_exception(regs, trapnr, error_code, 0)) + return true; + + current->thread.error_code = error_code; + current->thread.trap_nr = trapnr; + + /* + * To be potentially processing a kprobe fault and to trust the result + * from kprobe_running(), we have to be non-preemptible. + */ + if (!preemptible() && kprobe_running() && + kprobe_fault_handler(regs, trapnr)) + return true; + + return notify_die(DIE_GPF, str, regs, error_code, trapnr, SIGSEGV) == NOTIFY_STOP; +} + +static void gp_user_force_sig_segv(struct pt_regs *regs, int trapnr, + unsigned long error_code, const char *str) +{ + current->thread.error_code = error_code; + current->thread.trap_nr = trapnr; + show_signal(current, SIGSEGV, "", str, regs, error_code); + force_sig(SIGSEGV); +} + DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) { char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR; enum kernel_gp_hint hint = GP_NO_HINT; - struct task_struct *tsk; unsigned long gp_addr; - int ret; if (user_mode(regs) && try_fixup_enqcmd_gp()) return; @@ -711,40 +738,18 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) return; } - tsk = current; - if (user_mode(regs)) { if (fixup_iopl_exception(regs)) goto exit; - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = X86_TRAP_GP; - if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0)) goto exit; - show_signal(tsk, SIGSEGV, "", desc, regs, error_code); - force_sig(SIGSEGV); + gp_user_force_sig_segv(regs, X86_TRAP_GP, error_code, desc); goto exit; } - if (fixup_exception(regs, X86_TRAP_GP, error_code, 0)) - goto exit; - - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = X86_TRAP_GP; - - /* - * To be potentially processing a kprobe fault and to trust the result - * from kprobe_running(), we have to be non-preemptible. - */ - if (!preemptible() && - kprobe_running() && - kprobe_fault_handler(regs, X86_TRAP_GP)) - goto exit; - - ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV); - if (ret == NOTIFY_STOP) + if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc)) goto exit; if (error_code) -- cgit v1.2.3 From 9a22bf6debbf5169f750af53c7f86eb4e3cd6712 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:16 +0300 Subject: x86/traps: Add #VE support for TDX guest Virtualization Exceptions (#VE) are delivered to TDX guests due to specific guest actions which may happen in either user space or the kernel: * Specific instructions (WBINVD, for example) * Specific MSR accesses * Specific CPUID leaf accesses * Access to specific guest physical addresses Syscall entry code has a critical window where the kernel stack is not yet set up. Any exception in this window leads to hard to debug issues and can be exploited for privilege escalation. Exceptions in the NMI entry code also cause issues. Returning from the exception handler with IRET will re-enable NMIs and nested NMI will corrupt the NMI stack. For these reasons, the kernel avoids #VEs during the syscall gap and the NMI entry code. Entry code paths do not access TD-shared memory, MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves that might generate #VE. VMM can remove memory from TD at any point, but access to unaccepted (or missing) private memory leads to VM termination, not to #VE. Similarly to page faults and breakpoints, #VEs are allowed in NMI handlers once the kernel is ready to deal with nested NMIs. During #VE delivery, all interrupts, including NMIs, are blocked until TDGETVEINFO is called. It prevents #VE nesting until the kernel reads the VE info. TDGETVEINFO retrieves the #VE info from the TDX module, which also clears the "#VE valid" flag. This must be done before anything else as any #VE that occurs while the valid flag is set escalates to #DF by TDX module. It will result in an oops. Virtual NMIs are inhibited if the #VE valid flag is set. NMI will not be delivered until TDGETVEINFO is called. For now, convert unhandled #VE's (everything, until later in this series) so that they appear just like a #GP by calling the ve_raise_fault() directly. The ve_raise_fault() function is similar to #GP handler and is responsible for sending SIGSEGV to userspace and CPU die and notifying debuggers and other die chain users. Co-developed-by: Sean Christopherson Co-developed-by: Kuppuswamy Sathyanarayanan Signed-off-by: Sean Christopherson Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Tony Luck Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20220405232939.73860-8-kirill.shutemov@linux.intel.com --- arch/x86/coco/tdx/tdx.c | 38 ++++++++++++++++++ arch/x86/include/asm/idtentry.h | 4 ++ arch/x86/include/asm/tdx.h | 21 ++++++++++ arch/x86/kernel/idt.c | 3 ++ arch/x86/kernel/traps.c | 86 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 152 insertions(+) diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index e84f6dd3ed2a..60a3f2ff5b95 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -10,6 +10,7 @@ /* TDX module Call Leaf IDs */ #define TDX_GET_INFO 1 +#define TDX_GET_VEINFO 3 /* * Wrapper for standard use of __tdx_hypercall with no output aside from @@ -73,6 +74,43 @@ static u64 get_cc_mask(void) return BIT_ULL(gpa_width - 1); } +void tdx_get_ve_info(struct ve_info *ve) +{ + struct tdx_module_output out; + + /* + * Called during #VE handling to retrieve the #VE info from the + * TDX module. + * + * This has to be called early in #VE handling. A "nested" #VE which + * occurs before this will raise a #DF and is not recoverable. + * + * The call retrieves the #VE info from the TDX module, which also + * clears the "#VE valid" flag. This must be done before anything else + * because any #VE that occurs while the valid flag is set will lead to + * #DF. + * + * Note, the TDX module treats virtual NMIs as inhibited if the #VE + * valid flag is set. It means that NMI=>#VE will not result in a #DF. + */ + tdx_module_call(TDX_GET_VEINFO, 0, 0, 0, 0, &out); + + /* Transfer the output parameters */ + ve->exit_reason = out.rcx; + ve->exit_qual = out.rdx; + ve->gla = out.r8; + ve->gpa = out.r9; + ve->instr_len = lower_32_bits(out.r10); + ve->instr_info = upper_32_bits(out.r10); +} + +bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve) +{ + pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); + + return false; +} + void __init tdx_early_init(void) { u64 cc_mask; diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 7924f27f5c8b..72184b0b2219 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -632,6 +632,10 @@ DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER, exc_xen_hypervisor_callback); DECLARE_IDTENTRY_RAW(X86_TRAP_OTHER, exc_xen_unknown_trap); #endif +#ifdef CONFIG_INTEL_TDX_GUEST +DECLARE_IDTENTRY(X86_TRAP_VE, exc_virtualization_exception); +#endif + /* Device interrupts common/spurious */ DECLARE_IDTENTRY_IRQ(X86_TRAP_OTHER, common_interrupt); #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index a33d47abe67d..c4142e7b004c 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -6,6 +6,7 @@ #include #include #include +#include #define TDX_CPUID_LEAF_ID 0x21 #define TDX_IDENT "IntelTDX " @@ -56,6 +57,22 @@ struct tdx_hypercall_args { u64 r15; }; +/* + * Used by the #VE exception handler to gather the #VE exception + * info from the TDX module. This is a software only structure + * and not part of the TDX module/VMM ABI. + */ +struct ve_info { + u64 exit_reason; + u64 exit_qual; + /* Guest Linear (virtual) Address */ + u64 gla; + /* Guest Physical Address */ + u64 gpa; + u32 instr_len; + u32 instr_info; +}; + #ifdef CONFIG_INTEL_TDX_GUEST void __init tdx_early_init(void); @@ -70,6 +87,10 @@ u64 __tdx_hypercall(struct tdx_hypercall_args *args, unsigned long flags); /* Called from __tdx_hypercall() for unrecoverable failure */ void __tdx_hypercall_failed(void); +void tdx_get_ve_info(struct ve_info *ve); + +bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve); + #else static inline void tdx_early_init(void) { }; diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 608eb63bf044..a58c6bc1cd68 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -69,6 +69,9 @@ static const __initconst struct idt_data early_idts[] = { */ INTG(X86_TRAP_PF, asm_exc_page_fault), #endif +#ifdef CONFIG_INTEL_TDX_GUEST + INTG(X86_TRAP_VE, asm_exc_virtualization_exception), +#endif }; /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index db8d22a0d003..f9fb6530338f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -62,6 +62,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 #include @@ -1348,6 +1349,91 @@ DEFINE_IDTENTRY(exc_device_not_available) } } +#ifdef CONFIG_INTEL_TDX_GUEST + +#define VE_FAULT_STR "VE fault" + +static void ve_raise_fault(struct pt_regs *regs, long error_code) +{ + if (user_mode(regs)) { + gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR); + return; + } + + if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, VE_FAULT_STR)) + return; + + die_addr(VE_FAULT_STR, regs, error_code, 0); +} + +/* + * Virtualization Exceptions (#VE) are delivered to TDX guests due to + * specific guest actions which may happen in either user space or the + * kernel: + * + * * Specific instructions (WBINVD, for example) + * * Specific MSR accesses + * * Specific CPUID leaf accesses + * * Access to specific guest physical addresses + * + * In the settings that Linux will run in, virtualization exceptions are + * never generated on accesses to normal, TD-private memory that has been + * accepted. + * + * Syscall entry code has a critical window where the kernel stack is not + * yet set up. Any exception in this window leads to hard to debug issues + * and can be exploited for privilege escalation. Exceptions in the NMI + * entry code also cause issues. Returning from the exception handler with + * IRET will re-enable NMIs and nested NMI will corrupt the NMI stack. + * + * For these reasons, the kernel avoids #VEs during the syscall gap and + * the NMI entry code. Entry code paths do not access TD-shared memory, + * MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves + * that might generate #VE. VMM can remove memory from TD at any point, + * but access to unaccepted (or missing) private memory leads to VM + * termination, not to #VE. + * + * Similarly to page faults and breakpoints, #VEs are allowed in NMI + * handlers once the kernel is ready to deal with nested NMIs. + * + * During #VE delivery, all interrupts, including NMIs, are blocked until + * TDGETVEINFO is called. It prevents #VE nesting until the kernel reads + * the VE info. + * + * If a guest kernel action which would normally cause a #VE occurs in + * the interrupt-disabled region before TDGETVEINFO, a #DF (fault + * exception) is delivered to the guest which will result in an oops. + * + * The entry code has been audited carefully for following these expectations. + * Changes in the entry code have to be audited for correctness vs. this + * aspect. Similarly to #PF, #VE in these places will expose kernel to + * privilege escalation or may lead to random crashes. + */ +DEFINE_IDTENTRY(exc_virtualization_exception) +{ + struct ve_info ve; + + /* + * NMIs/Machine-checks/Interrupts will be in a disabled state + * till TDGETVEINFO TDCALL is executed. This ensures that VE + * info cannot be overwritten by a nested #VE. + */ + tdx_get_ve_info(&ve); + + cond_local_irq_enable(regs); + + /* + * If tdx_handle_virt_exception() could not process + * it successfully, treat it as #GP(0) and handle it. + */ + if (!tdx_handle_virt_exception(regs, &ve)) + ve_raise_fault(regs, 0); + + cond_local_irq_disable(regs); +} + +#endif + #ifdef CONFIG_X86_32 DEFINE_IDTENTRY_SW(iret_error) { -- cgit v1.2.3 From bfe6ed0c672782ac2a8edffac93b1ba84b0ff984 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:17 +0300 Subject: x86/tdx: Add HLT support for TDX guests The HLT instruction is a privileged instruction, executing it stops instruction execution and places the processor in a HALT state. It is used in kernel for cases like reboot, idle loop and exception fixup handlers. For the idle case, interrupts will be enabled (using STI) before the HLT instruction (this is also called safe_halt()). To support the HLT instruction in TDX guests, it needs to be emulated using TDVMCALL (hypercall to VMM). More details about it can be found in Intel Trust Domain Extensions (Intel TDX) Guest-Host-Communication Interface (GHCI) specification, section TDVMCALL[Instruction.HLT]. In TDX guests, executing HLT instruction will generate a #VE, which is used to emulate the HLT instruction. But #VE based emulation will not work for the safe_halt() flavor, because it requires STI instruction to be executed just before the TDCALL. Since idle loop is the only user of safe_halt() variant, handle it as a special case. To avoid *safe_halt() call in the idle function, define the tdx_guest_idle() and use it to override the "x86_idle" function pointer for a valid TDX guest. Alternative choices like PV ops have been considered for adding safe_halt() support. But it was rejected because HLT paravirt calls only exist under PARAVIRT_XXL, and enabling it in TDX guest just for safe_halt() use case is not worth the cost. Co-developed-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Tony Luck Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20220405232939.73860-9-kirill.shutemov@linux.intel.com --- arch/x86/coco/tdx/tdcall.S | 13 +++++++ arch/x86/coco/tdx/tdx.c | 93 +++++++++++++++++++++++++++++++++++++++++++++- arch/x86/include/asm/tdx.h | 4 ++ arch/x86/kernel/process.c | 4 ++ 4 files changed, 112 insertions(+), 2 deletions(-) diff --git a/arch/x86/coco/tdx/tdcall.S b/arch/x86/coco/tdx/tdcall.S index 662479ccf630..245888290bb6 100644 --- a/arch/x86/coco/tdx/tdcall.S +++ b/arch/x86/coco/tdx/tdcall.S @@ -139,6 +139,19 @@ SYM_FUNC_START(__tdx_hypercall) movl $TDVMCALL_EXPOSE_REGS_MASK, %ecx + /* + * For the idle loop STI needs to be called directly before the TDCALL + * that enters idle (EXIT_REASON_HLT case). STI instruction enables + * interrupts only one instruction later. If there is a window between + * STI and the instruction that emulates the HALT state, there is a + * chance for interrupts to happen in this window, which can delay the + * HLT operation indefinitely. Since this is the not the desired + * result, conditionally call STI before TDCALL. + */ + testq $TDX_HCALL_ISSUE_STI, %rsi + jz .Lskip_sti + sti +.Lskip_sti: tdcall /* diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 60a3f2ff5b95..ed7302581cc7 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -7,6 +7,7 @@ #include #include #include +#include /* TDX module Call Leaf IDs */ #define TDX_GET_INFO 1 @@ -36,6 +37,17 @@ void __tdx_hypercall_failed(void) panic("TDVMCALL failed. TDX module bug?"); } +/* + * The TDG.VP.VMCALL-Instruction-execution sub-functions are defined + * independently from but are currently matched 1:1 with VMX EXIT_REASONs. + * Reusing the KVM EXIT_REASON macros makes it easier to connect the host and + * guest sides of these calls. + */ +static u64 hcall_func(u64 exit_reason) +{ + return exit_reason; +} + /* * Used for TDX guests to make calls directly to the TD module. This * should only be used for calls that have no legitimate reason to fail @@ -74,6 +86,62 @@ static u64 get_cc_mask(void) return BIT_ULL(gpa_width - 1); } +static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = hcall_func(EXIT_REASON_HLT), + .r12 = irq_disabled, + }; + + /* + * Emulate HLT operation via hypercall. More info about ABI + * can be found in TDX Guest-Host-Communication Interface + * (GHCI), section 3.8 TDG.VP.VMCALL. + * + * The VMM uses the "IRQ disabled" param to understand IRQ + * enabled status (RFLAGS.IF) of the TD guest and to determine + * whether or not it should schedule the halted vCPU if an + * IRQ becomes pending. E.g. if IRQs are disabled, the VMM + * can keep the vCPU in virtual HLT, even if an IRQ is + * pending, without hanging/breaking the guest. + */ + return __tdx_hypercall(&args, do_sti ? TDX_HCALL_ISSUE_STI : 0); +} + +static bool handle_halt(void) +{ + /* + * Since non safe halt is mainly used in CPU offlining + * and the guest will always stay in the halt state, don't + * call the STI instruction (set do_sti as false). + */ + const bool irq_disabled = irqs_disabled(); + const bool do_sti = false; + + if (__halt(irq_disabled, do_sti)) + return false; + + return true; +} + +void __cpuidle tdx_safe_halt(void) +{ + /* + * For do_sti=true case, __tdx_hypercall() function enables + * interrupts using the STI instruction before the TDCALL. So + * set irq_disabled as false. + */ + const bool irq_disabled = false; + const bool do_sti = true; + + /* + * Use WARN_ONCE() to report the failure. + */ + if (__halt(irq_disabled, do_sti)) + WARN_ONCE(1, "HLT instruction emulation failed\n"); +} + void tdx_get_ve_info(struct ve_info *ve) { struct tdx_module_output out; @@ -104,11 +172,32 @@ void tdx_get_ve_info(struct ve_info *ve) ve->instr_info = upper_32_bits(out.r10); } +/* Handle the kernel #VE */ +static bool virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) +{ + switch (ve->exit_reason) { + case EXIT_REASON_HLT: + return handle_halt(); + default: + pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); + return false; + } +} + bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve) { - pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); + bool ret; + + if (user_mode(regs)) + ret = false; + else + ret = virt_exception_kernel(regs, ve); + + /* After successful #VE handling, move the IP */ + if (ret) + regs->ip += ve->instr_len; - return false; + return ret; } void __init tdx_early_init(void) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index c4142e7b004c..cbd61e142f4e 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -14,6 +14,7 @@ #define TDX_HYPERCALL_STANDARD 0 #define TDX_HCALL_HAS_OUTPUT BIT(0) +#define TDX_HCALL_ISSUE_STI BIT(1) /* * SW-defined error codes. @@ -91,9 +92,12 @@ void tdx_get_ve_info(struct ve_info *ve); bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve); +void tdx_safe_halt(void); + #else static inline void tdx_early_init(void) { }; +static inline void tdx_safe_halt(void) { }; #endif /* CONFIG_INTEL_TDX_GUEST */ diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b370767f5b19..dbaf12c43fe1 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -46,6 +46,7 @@ #include #include #include +#include #include "process.h" @@ -873,6 +874,9 @@ void select_idle_routine(const struct cpuinfo_x86 *c) } else if (prefer_mwait_c1_over_halt(c)) { pr_info("using mwait in idle threads\n"); x86_idle = mwait_idle; + } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { + pr_info("using TDX aware idle routine\n"); + x86_idle = tdx_safe_halt; } else x86_idle = default_idle; } -- cgit v1.2.3 From ae87f609cd52825fa7fa36f02b29e4357fd29eaa Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:18 +0300 Subject: x86/tdx: Add MSR support for TDX guests Use hypercall to emulate MSR read/write for the TDX platform. There are two viable approaches for doing MSRs in a TD guest: 1. Execute the RDMSR/WRMSR instructions like most VMs and bare metal do. Some will succeed, others will cause a #VE. All of those that cause a #VE will be handled with a TDCALL. 2. Use paravirt infrastructure. The paravirt hook has to keep a list of which MSRs would cause a #VE and use a TDCALL. All other MSRs execute RDMSR/WRMSR instructions directly. The second option can be ruled out because the list of MSRs was challenging to maintain. That leaves option #1 as the only viable solution for the minimal TDX support. Kernel relies on the exception fixup machinery to handle MSR access errors. #VE handler uses the same exception fixup code as #GP. It covers MSR accesses along with other types of fixups. For performance-critical MSR writes (like TSC_DEADLINE), future patches will replace the WRMSR/#VE sequence with the direct TDCALL. RDMSR and WRMSR specification details can be found in Guest-Host-Communication Interface (GHCI) for Intel Trust Domain Extensions (Intel TDX) specification, sec titled "TDG.VP. VMCALL" and "TDG.VP.VMCALL". Co-developed-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Tony Luck Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-10-kirill.shutemov@linux.intel.com --- arch/x86/coco/tdx/tdx.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index ed7302581cc7..00ff0a830970 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -142,6 +142,44 @@ void __cpuidle tdx_safe_halt(void) WARN_ONCE(1, "HLT instruction emulation failed\n"); } +static bool read_msr(struct pt_regs *regs) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = hcall_func(EXIT_REASON_MSR_READ), + .r12 = regs->cx, + }; + + /* + * Emulate the MSR read via hypercall. More info about ABI + * can be found in TDX Guest-Host-Communication Interface + * (GHCI), section titled "TDG.VP.VMCALL". + */ + if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT)) + return false; + + regs->ax = lower_32_bits(args.r11); + regs->dx = upper_32_bits(args.r11); + return true; +} + +static bool write_msr(struct pt_regs *regs) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = hcall_func(EXIT_REASON_MSR_WRITE), + .r12 = regs->cx, + .r13 = (u64)regs->dx << 32 | regs->ax, + }; + + /* + * Emulate the MSR write via hypercall. More info about ABI + * can be found in TDX Guest-Host-Communication Interface + * (GHCI) section titled "TDG.VP.VMCALL". + */ + return !__tdx_hypercall(&args, 0); +} + void tdx_get_ve_info(struct ve_info *ve) { struct tdx_module_output out; @@ -178,6 +216,10 @@ static bool virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) switch (ve->exit_reason) { case EXIT_REASON_HLT: return handle_halt(); + case EXIT_REASON_MSR_READ: + return read_msr(regs); + case EXIT_REASON_MSR_WRITE: + return write_msr(regs); default: pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); return false; -- cgit v1.2.3 From c141fa2c2bbaff0d1f2dc51160626dd22bc16ae2 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:19 +0300 Subject: x86/tdx: Handle CPUID via #VE In TDX guests, most CPUID leaf/sub-leaf combinations are virtualized by the TDX module while some trigger #VE. Implement the #VE handling for EXIT_REASON_CPUID by handing it through the hypercall, which in turn lets the TDX module handle it by invoking the host VMM. More details on CPUID Virtualization can be found in the TDX module specification, the section titled "CPUID Virtualization". Note that VMM that handles the hypercall is not trusted. It can return data that may steer the guest kernel in wrong direct. Only allow VMM to control range reserved for hypervisor communication. Return all-zeros for any CPUID outside the hypervisor range. It matches CPU behaviour for non-supported leaf. Co-developed-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Tony Luck Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-11-kirill.shutemov@linux.intel.com --- arch/x86/coco/tdx/tdx.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 00ff0a830970..50c3b97d6db7 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -180,6 +180,48 @@ static bool write_msr(struct pt_regs *regs) return !__tdx_hypercall(&args, 0); } +static bool handle_cpuid(struct pt_regs *regs) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = hcall_func(EXIT_REASON_CPUID), + .r12 = regs->ax, + .r13 = regs->cx, + }; + + /* + * Only allow VMM to control range reserved for hypervisor + * communication. + * + * Return all-zeros for any CPUID outside the range. It matches CPU + * behaviour for non-supported leaf. + */ + if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) { + regs->ax = regs->bx = regs->cx = regs->dx = 0; + return true; + } + + /* + * Emulate the CPUID instruction via a hypercall. More info about + * ABI can be found in TDX Guest-Host-Communication Interface + * (GHCI), section titled "VP.VMCALL". + */ + if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT)) + return false; + + /* + * As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of + * EAX, EBX, ECX, EDX registers after the CPUID instruction execution. + * So copy the register contents back to pt_regs. + */ + regs->ax = args.r12; + regs->bx = args.r13; + regs->cx = args.r14; + regs->dx = args.r15; + + return true; +} + void tdx_get_ve_info(struct ve_info *ve) { struct tdx_module_output out; @@ -210,6 +252,18 @@ void tdx_get_ve_info(struct ve_info *ve) ve->instr_info = upper_32_bits(out.r10); } +/* Handle the user initiated #VE */ +static bool virt_exception_user(struct pt_regs *regs, struct ve_info *ve) +{ + switch (ve->exit_reason) { + case EXIT_REASON_CPUID: + return handle_cpuid(regs); + default: + pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); + return false; + } +} + /* Handle the kernel #VE */ static bool virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) { @@ -220,6 +274,8 @@ static bool virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) return read_msr(regs); case EXIT_REASON_MSR_WRITE: return write_msr(regs); + case EXIT_REASON_CPUID: + return handle_cpuid(regs); default: pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); return false; @@ -231,7 +287,7 @@ bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve) bool ret; if (user_mode(regs)) - ret = false; + ret = virt_exception_user(regs, ve); else ret = virt_exception_kernel(regs, ve); -- cgit v1.2.3 From 31d58c4e557d46fa7f8557714250fb6f89c941ae Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:20 +0300 Subject: x86/tdx: Handle in-kernel MMIO In non-TDX VMs, MMIO is implemented by providing the guest a mapping which will cause a VMEXIT on access and then the VMM emulating the instruction that caused the VMEXIT. That's not possible for TDX VM. To emulate an instruction an emulator needs two things: - R/W access to the register file to read/modify instruction arguments and see RIP of the faulted instruction. - Read access to memory where instruction is placed to see what to emulate. In this case it is guest kernel text. Both of them are not available to VMM in TDX environment: - Register file is never exposed to VMM. When a TD exits to the module, it saves registers into the state-save area allocated for that TD. The module then scrubs these registers before returning execution control to the VMM, to help prevent leakage of TD state. - TDX does not allow guests to execute from shared memory. All executed instructions