diff options
Diffstat (limited to 'arch/x86/kvm/vmx.c')
| -rw-r--r-- | arch/x86/kvm/vmx.c | 15252 |
1 files changed, 0 insertions, 15252 deletions
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c deleted file mode 100644 index 8d5d984541be..000000000000 --- a/arch/x86/kvm/vmx.c +++ /dev/null @@ -1,15252 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. - * - * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * Authors: - * Avi Kivity <avi@qumranet.com> - * Yaniv Kamay <yaniv@qumranet.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include "irq.h" -#include "mmu.h" -#include "cpuid.h" -#include "lapic.h" -#include "hyperv.h" - -#include <linux/kvm_host.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/sched.h> -#include <linux/moduleparam.h> -#include <linux/mod_devicetable.h> -#include <linux/trace_events.h> -#include <linux/slab.h> -#include <linux/tboot.h> -#include <linux/hrtimer.h> -#include <linux/frame.h> -#include <linux/nospec.h> -#include "kvm_cache_regs.h" -#include "x86.h" - -#include <asm/asm.h> -#include <asm/cpu.h> -#include <asm/io.h> -#include <asm/desc.h> -#include <asm/vmx.h> -#include <asm/virtext.h> -#include <asm/mce.h> -#include <asm/fpu/internal.h> -#include <asm/perf_event.h> -#include <asm/debugreg.h> -#include <asm/kexec.h> -#include <asm/apic.h> -#include <asm/irq_remapping.h> -#include <asm/mmu_context.h> -#include <asm/spec-ctrl.h> -#include <asm/mshyperv.h> - -#include "trace.h" -#include "pmu.h" -#include "vmx_evmcs.h" - -#define __ex(x) __kvm_handle_fault_on_reboot(x) -#define __ex_clear(x, reg) \ - ____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg) - -MODULE_AUTHOR("Qumranet"); -MODULE_LICENSE("GPL"); - -static const struct x86_cpu_id vmx_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_VMX), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); - -static bool __read_mostly enable_vpid = 1; -module_param_named(vpid, enable_vpid, bool, 0444); - -static bool __read_mostly enable_vnmi = 1; -module_param_named(vnmi, enable_vnmi, bool, S_IRUGO); - -static bool __read_mostly flexpriority_enabled = 1; -module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); - -static bool __read_mostly enable_ept = 1; -module_param_named(ept, enable_ept, bool, S_IRUGO); - -static bool __read_mostly enable_unrestricted_guest = 1; -module_param_named(unrestricted_guest, - enable_unrestricted_guest, bool, S_IRUGO); - -static bool __read_mostly enable_ept_ad_bits = 1; -module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); - -static bool __read_mostly emulate_invalid_guest_state = true; -module_param(emulate_invalid_guest_state, bool, S_IRUGO); - -static bool __read_mostly fasteoi = 1; -module_param(fasteoi, bool, S_IRUGO); - -static bool __read_mostly enable_apicv = 1; -module_param(enable_apicv, bool, S_IRUGO); - -static bool __read_mostly enable_shadow_vmcs = 1; -module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); -/* - * If nested=1, nested virtualization is supported, i.e., guests may use - * VMX and be a hypervisor for its own guests. If nested=0, guests may not - * use VMX instructions. - */ -static bool __read_mostly nested = 1; -module_param(nested, bool, S_IRUGO); - -static bool __read_mostly nested_early_check = 0; -module_param(nested_early_check, bool, S_IRUGO); - -static u64 __read_mostly host_xss; - -static bool __read_mostly enable_pml = 1; -module_param_named(pml, enable_pml, bool, S_IRUGO); - -#define MSR_TYPE_R 1 -#define MSR_TYPE_W 2 -#define MSR_TYPE_RW 3 - -#define MSR_BITMAP_MODE_X2APIC 1 -#define MSR_BITMAP_MODE_X2APIC_APICV 2 - -#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL - -/* Guest_tsc -> host_tsc conversion requires 64-bit division. */ -static int __read_mostly cpu_preemption_timer_multi; -static bool __read_mostly enable_preemption_timer = 1; -#ifdef CONFIG_X86_64 -module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); -#endif - -#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) -#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE -#define KVM_VM_CR0_ALWAYS_ON \ - (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \ - X86_CR0_WP | X86_CR0_PG | X86_CR0_PE) -#define KVM_CR4_GUEST_OWNED_BITS \ - (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD) - -#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE -#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) -#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) - -#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) - -#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 - -/* - * Hyper-V requires all of these, so mark them as supported even though - * they are just treated the same as all-context. - */ -#define VMX_VPID_EXTENT_SUPPORTED_MASK \ - (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ - VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ - VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ - VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) - -/* - * These 2 parameters are used to config the controls for Pause-Loop Exiting: - * ple_gap: upper bound on the amount of time between two successive - * executions of PAUSE in a loop. Also indicate if ple enabled. - * According to test, this time is usually smaller than 128 cycles. - * ple_window: upper bound on the amount of time a guest is allowed to execute - * in a PAUSE loop. Tests indicate that most spinlocks are held for - * less than 2^12 cycles - * Time is measured based on a counter that runs at the same rate as the TSC, - * refer SDM volume 3b section 21.6.13 & 22.1.3. - */ -static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP; -module_param(ple_gap, uint, 0444); - -static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; -module_param(ple_window, uint, 0444); - -/* Default doubles per-vcpu window every exit. */ -static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW; -module_param(ple_window_grow, uint, 0444); - -/* Default resets per-vcpu window every exit to ple_window. */ -static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK; -module_param(ple_window_shrink, uint, 0444); - -/* Default is to compute the maximum so we can never overflow. */ -static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; -module_param(ple_window_max, uint, 0444); - -extern const ulong vmx_return; -extern const ulong vmx_early_consistency_check_return; - -static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); -static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); -static DEFINE_MUTEX(vmx_l1d_flush_mutex); - -/* Storage for pre module init parameter parsing */ -static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; - -static const struct { - const char *option; - bool for_parse; -} vmentry_l1d_param[] = { - [VMENTER_L1D_FLUSH_AUTO] = {"auto", true}, - [VMENTER_L1D_FLUSH_NEVER] = {"never", true}, - [VMENTER_L1D_FLUSH_COND] = {"cond", true}, - [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true}, - [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, - [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, -}; - -#define L1D_CACHE_ORDER 4 -static void *vmx_l1d_flush_pages; - -static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) -{ - struct page *page; - unsigned int i; - - if (!enable_ept) { - l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; - return 0; - } - - if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { - u64 msr; - - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); - if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { - l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; - return 0; - } - } - - /* If set to auto use the default l1tf mitigation method */ - if (l1tf == VMENTER_L1D_FLUSH_AUTO) { - switch (l1tf_mitigation) { - case L1TF_MITIGATION_OFF: - l1tf = VMENTER_L1D_FLUSH_NEVER; - break; - case L1TF_MITIGATION_FLUSH_NOWARN: - case L1TF_MITIGATION_FLUSH: - case L1TF_MITIGATION_FLUSH_NOSMT: - l1tf = VMENTER_L1D_FLUSH_COND; - break; - case L1TF_MITIGATION_FULL: - case L1TF_MITIGATION_FULL_FORCE: - l1tf = VMENTER_L1D_FLUSH_ALWAYS; - break; - } - } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) { - l1tf = VMENTER_L1D_FLUSH_ALWAYS; - } - - if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && - !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { - page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); - if (!page) - return -ENOMEM; - vmx_l1d_flush_pages = page_address(page); - - /* - * Initialize each page with a different pattern in - * order to protect against KSM in the nested - * virtualization case. - */ - for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { - memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1, - PAGE_SIZE); - } - } - - l1tf_vmx_mitigation = l1tf; - - if (l1tf != VMENTER_L1D_FLUSH_NEVER) - static_branch_enable(&vmx_l1d_should_flush); - else - static_branch_disable(&vmx_l1d_should_flush); - - if (l1tf == VMENTER_L1D_FLUSH_COND) - static_branch_enable(&vmx_l1d_flush_cond); - else - static_branch_disable(&vmx_l1d_flush_cond); - return 0; -} - -static int vmentry_l1d_flush_parse(const char *s) -{ - unsigned int i; - - if (s) { - for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { - if (vmentry_l1d_param[i].for_parse && - sysfs_streq(s, vmentry_l1d_param[i].option)) - return i; - } - } - return -EINVAL; -} - -static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) -{ - int l1tf, ret; - - l1tf = vmentry_l1d_flush_parse(s); - if (l1tf < 0) - return l1tf; - - if (!boot_cpu_has(X86_BUG_L1TF)) - return 0; - - /* - * Has vmx_init() run already? If not then this is the pre init - * parameter parsing. In that case just store the value and let - * vmx_init() do the proper setup after enable_ept has been - * established. - */ - if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) { - vmentry_l1d_flush_param = l1tf; - return 0; - } - - mutex_lock(&vmx_l1d_flush_mutex); - ret = vmx_setup_l1d_flush(l1tf); - mutex_unlock(&vmx_l1d_flush_mutex); - return ret; -} - -static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) -{ - if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param))) - return sprintf(s, "???\n"); - - return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); -} - -static const struct kernel_param_ops vmentry_l1d_flush_ops = { - .set = vmentry_l1d_flush_set, - .get = vmentry_l1d_flush_get, -}; -module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); - -enum ept_pointers_status { - EPT_POINTERS_CHECK = 0, - EPT_POINTERS_MATCH = 1, - EPT_POINTERS_MISMATCH = 2 -}; - -struct kvm_vmx { - struct kvm kvm; - - unsigned int tss_addr; - bool ept_identity_pagetable_done; - gpa_t ept_identity_map_addr; - - enum ept_pointers_status ept_pointers_match; - spinlock_t ept_pointer_lock; -}; - -#define NR_AUTOLOAD_MSRS 8 - -struct vmcs_hdr { - u32 revision_id:31; - u32 shadow_vmcs:1; -}; - -struct vmcs { - struct vmcs_hdr hdr; - u32 abort; - char data[0]; -}; - -/* - * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT - * and whose values change infrequently, but are not constant. I.e. this is - * used as a write-through cache of the corresponding VMCS fields. - */ -struct vmcs_host_state { - unsigned long cr3; /* May not match real cr3 */ - unsigned long cr4; /* May not match real cr4 */ - unsigned long gs_base; - unsigned long fs_base; - - u16 fs_sel, gs_sel, ldt_sel; -#ifdef CONFIG_X86_64 - u16 ds_sel, es_sel; -#endif -}; - -/* - * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also - * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs - * loaded on this CPU (so we can clear them if the CPU goes down). - */ -struct loaded_vmcs { - struct vmcs *vmcs; - struct vmcs *shadow_vmcs; - int cpu; - bool launched; - bool nmi_known_unmasked; - bool hv_timer_armed; - /* Support for vnmi-less CPUs */ - int soft_vnmi_blocked; - ktime_t entry_time; - s64 vnmi_blocked_time; - unsigned long *msr_bitmap; - struct list_head loaded_vmcss_on_cpu_link; - struct vmcs_host_state host_state; -}; - -struct shared_msr_entry { - unsigned index; - u64 data; - u64 mask; -}; - -/* - * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a - * single nested guest (L2), hence the name vmcs12. Any VMX implementation has - * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is - * stored in guest memory specified by VMPTRLD, but is opaque to the guest, - * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. - * More than one of these structures may exist, if L1 runs multiple L2 guests. - * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the - * underlying hardware which will be used to run L2. - * This structure is packed to ensure that its layout is identical across - * machines (necessary for live migration). - * - * IMPORTANT: Changing the layout of existing fields in this structure - * will break save/restore compatibility with older kvm releases. When - * adding new fields, either use space in the reserved padding* arrays - * or add the new fields to the end of the structure. - */ -typedef u64 natural_width; -struct __packed vmcs12 { - /* According to the Intel spec, a VMCS region must start with the - * following two fields. Then follow implementation-specific data. - */ - struct vmcs_hdr hdr; - u32 abort; - - u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ - u32 padding[7]; /* room for future expansion */ - - u64 io_bitmap_a; - u64 io_bitmap_b; - u64 msr_bitmap; - u64 vm_exit_msr_store_addr; - u64 vm_exit_msr_load_addr; - u64 vm_entry_msr_load_addr; - u64 tsc_offset; - u64 virtual_apic_page_addr; - u64 apic_access_addr; - u64 posted_intr_desc_addr; - u64 ept_pointer; - u64 eoi_exit_bitmap0; - u64 eoi_exit_bitmap1; - u64 eoi_exit_bitmap2; - u64 eoi_exit_bitmap3; - u64 xss_exit_bitmap; - u64 guest_physical_address; - u64 vmcs_link_pointer; - u64 guest_ia32_debugctl; - u64 guest_ia32_pat; - u64 guest_ia32_efer; - u64 guest_ia32_perf_global_ctrl; - u64 guest_pdptr0; - u64 guest_pdptr1; - u64 guest_pdptr2; - u64 guest_pdptr3; - u64 guest_bndcfgs; - u64 host_ia32_pat; - u64 host_ia32_efer; - u64 host_ia32_perf_global_ctrl; - u64 vmread_bitmap; - u64 vmwrite_bitmap; - u64 vm_function_control; - u64 eptp_list_address; - u64 pml_address; - u64 padding64[3]; /* room for future expansion */ - /* - * To allow migration of L1 (complete with its L2 guests) between - * machines of different natural widths (32 or 64 bit), we cannot have - * unsigned long fields with no explict size. We use u64 (aliased - * natural_width) instead. Luckily, x86 is little-endian. - */ - natural_width cr0_guest_host_mask; - natural_width cr4_guest_host_mask; - natural_width cr0_read_shadow; - natural_width cr4_read_shadow; - natural_width cr3_target_value0; - natural_width cr3_target_value1; - natural_width cr3_target_value2; - natural_width cr3_target_value3; - natural_width exit_qualification; - natural_width guest_linear_address; - natural_width guest_cr0; - natural_width guest_cr3; - natural_width guest_cr4; - natural_width guest_es_base; - natural_width guest_cs_base; - natural_width guest_ss_base; - natural_width guest_ds_base; - natural_width guest_fs_base; - natural_width guest_gs_base; - natural_width guest_ldtr_base; - natural_width guest_tr_base; - natural_width guest_gdtr_base; - natural_width guest_idtr_base; - natural_width guest_dr7; - natural_width guest_rsp; - natural_width guest_rip; - natural_width guest_rflags; - natural_width guest_pending_dbg_exceptions; - natural_width guest_sysenter_esp; - natural_width guest_sysenter_eip; - natural_width host_cr0; - natural_width host_cr3; - natural_width host_cr4; - natural_width host_fs_base; - natural_width host_gs_base; - natural_width host_tr_base; - natural_width host_gdtr_base; - natural_width host_idtr_base; - natural_width host_ia32_sysenter_esp; - natural_width host_ia32_sysenter_eip; - natural_width host_rsp; - natural_width host_rip; - natural_width paddingl[8]; /* room for future expansion */ - u32 pin_based_vm_exec_control; - u32 cpu_based_vm_exec_control; - u32 exception_bitmap; - u32 page_fault_error_code_mask; - u32 page_fault_error_code_match; - u32 cr3_target_count; - u32 vm_exit_controls; - u32 vm_exit_msr_store_count; - u32 vm_exit_msr_load_count; - u32 vm_entry_controls; - u32 vm_entry_msr_load_count; - u32 vm_entry_intr_info_field; - u32 vm_entry_exception_error_code; - u32 vm_entry_instruction_len; - u32 tpr_threshold; - u32 secondary_vm_exec_control; - u32 vm_instruction_error; - u32 vm_exit_reason; - u32 vm_exit_intr_info; - u32 vm_exit_intr_error_code; - u32 idt_vectoring_info_field; - u32 idt_vectoring_error_code; - u32 vm_exit_instruction_len; - u32 vmx_instruction_info; - u32 guest_es_limit; - u32 guest_cs_limit; - u32 guest_ss_limit; - u32 guest_ds_limit; - u32 guest_fs_limit; - u32 guest_gs_limit; - u32 guest_ldtr_limit; - u32 guest_tr_limit; - u32 guest_gdtr_limit; - u32 guest_idtr_limit; - u32 guest_es_ar_bytes; - u32 guest_cs_ar_bytes; - u32 guest_ss_ar_bytes; - u32 guest_ds_ar_bytes; - u32 guest_fs_ar_bytes; - u32 guest_gs_ar_bytes; - u32 guest_ldtr_ar_bytes; - u32 guest_tr_ar_bytes; - u32 guest_interruptibility_info; - u32 guest_activity_state; - u32 guest_sysenter_cs; - u32 host_ia32_sysenter_cs; - u32 vmx_preemption_timer_value; - u32 padding32[7]; /* room for future expansion */ - u16 virtual_processor_id; - u16 posted_intr_nv; - u16 guest_es_selector; - u16 guest_cs_selector; - u16 guest_ss_selector; - u16 guest_ds_selector; - u16 guest_fs_selector; - u16 guest_gs_selector; - u16 guest_ldtr_selector; - u16 guest_tr_selector; - u16 guest_intr_status; - u16 host_es_selector; - u16 host_cs_selector; - u16 host_ss_selector; - u16 host_ds_selector; - u16 host_fs_selector; - u16 host_gs_selector; - u16 host_tr_selector; - u16 guest_pml_index; -}; - -/* - * For save/restore compatibility, the vmcs12 field offsets must not change. - */ -#define CHECK_OFFSET(field, loc) \ - BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc), \ - "Offset of " #field " in struct vmcs12 has changed.") - -static inline void vmx_check_vmcs12_offsets(void) { - CHECK_OFFSET(hdr, 0); - CHECK_OFFSET(abort, 4); - CHECK_OFFSET(launch_state, 8); - CHECK_OFFSET(io_bitmap_a, 40); - CHECK_OFFSET(io_bitmap_b, 48); - CHECK_OFFSET(msr_bitmap, 56); - CHECK_OFFSET(vm_exit_msr_store_addr, 64); - CHECK_OFFSET(vm_exit_msr_load_addr, 72); - CHECK_OFFSET(vm_entry_msr_load_addr, 80); - CHECK_OFFSET(tsc_offset, 88); - CHECK_OFFSET(virtual_apic_page_addr, 96); - CHECK_OFFSET(apic_access_addr, 104); - CHECK_OFFSET(posted_intr_desc_addr, 112); - CHECK_OFFSET(ept_pointer, 120); - CHECK_OFFSET(eoi_exit_bitmap0, 128); - CHECK_OFFSET(eoi_exit_bitmap1, 136); - CHECK_OFFSET(eoi_exit_bitmap2, 144); - CHECK_OFFSET(eoi_exit_bitmap3, 152); - CHECK_OFFSET(xss_exit_bitmap, 160); - CHECK_OFFSET(guest_physical_address, 168); - CHECK_OFFSET(vmcs_link_pointer, 176); - CHECK_OFFSET(guest_ia32_debugctl, 184); - CHECK_OFFSET(guest_ia32_pat, 192); - CHECK_OFFSET(guest_ia32_efer, 200); - CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208); - CHECK_OFFSET(guest_pdptr0, 216); - CHECK_OFFSET(guest_pdptr1, 224); - CHECK_OFFSET(guest_pdptr2, 232); - CHECK_OFFSET(guest_pdptr3, 240); - CHECK_OFFSET(guest_bndcfgs, 248); - CHECK_OFFSET(host_ia32_pat, 256); - CHECK_OFFSET(host_ia32_efer, 264); - CHECK_OFFSET(host_ia32_perf_global_ctrl, 272); - CHECK_OFFSET(vmread_bitmap, 280); - CHECK_OFFSET(vmwrite_bitmap, 288); - CHECK_OFFSET(vm_function_control, 296); - CHECK_OFFSET(eptp_list_address, 304); - CHECK_OFFSET(pml_address, 312); - CHECK_OFFSET(cr0_guest_host_mask, 344); - CHECK_OFFSET(cr4_guest_host_mask, 352); - CHECK_OFFSET(cr0_read_shadow, 360); - CHECK_OFFSET(cr4_read_shadow, 368); - CHECK_OFFSET(cr3_target_value0, 376); - CHECK_OFFSET(cr3_target_value1, 384); - CHECK_OFFSET(cr3_target_value2, 392); - CHECK_OFFSET(cr3_target_value3, 400); - CHECK_OFFSET(exit_qualification, 408); - CHECK_OFFSET(guest_linear_address, 416); - CHECK_OFFSET(guest_cr0, 424); - CHECK_OFFSET(guest_cr3, 432); - CHECK_OFFSET(guest_cr4, 440); - CHECK_OFFSET(guest_es_base, 448); - CHECK_OFFSET(guest_cs_base, 456); - CHECK_OFFSET(guest_ss_base, 464); - CHECK_OFFSET(guest_ds_base, 472); - CHECK_OFFSET(guest_fs_base, 480); - CHECK_OFFSET(guest_gs_base, 488); - CHECK_OFFSET(guest_ldtr_base, 496); - CHECK_OFFSET(guest_tr_base, 504); - CHECK_OFFSET(guest_gdtr_base, 512); - CHECK_OFFSET(guest_idtr_base, 520); - CHECK_OFFSET(guest_dr7, 528); - CHECK_OFFSET(guest_rsp, 536); - CHECK_OFFSET(guest_rip, 544); - CHECK_OFFSET(guest_rflags, 552); - CHECK_OFFSET(guest_pending_dbg_exceptions, 560); - CHECK_OFFSET(guest_sysenter_esp, 568); - CHECK_OFFSET(guest_sysenter_eip, 576); - CHECK_OFFSET(host_cr0, 584); - CHECK_OFFSET(host_cr3, 592); - CHECK_OFFSET(host_cr4, 600); - CHECK_OFFSET(host_fs_base, 608); - CHECK_OFFSET(host_gs_base, 616); - CHECK_OFFSET(host_tr_base, 624); - CHECK_OFFSET(host_gdtr_base, 632); - CHECK_OFFSET(host_idtr_base, 640); - CHECK_OFFSET(host_ia32_sysenter_esp, 648); - CHECK_OFFSET(host_ia32_sysenter_eip, 656); - CHECK_OFFSET(host_rsp, 664); - CHECK_OFFSET(host_rip, 672); - CHECK_OFFSET(pin_based_vm_exec_control, 744); - CHECK_OFFSET(cpu_based_vm_exec_control, 748); - CHECK_OFFSET(exception_bitmap, 752); - CHECK_OFFSET(page_fault_error_code_mask, 756); - CHECK_OFFSET(page_fault_error_code_match, 760); - CHECK_OFFSET(cr3_target_count, 764); - CHECK_OFFSET(vm_exit_controls, 768); - CHECK_OFFSET(vm_exit_msr_store_count, 772); - CHECK_OFFSET(vm_exit_msr_load_count, 776); - CHECK_OFFSET(vm_entry_controls, 780); - CHECK_OFFSET(vm_entry_msr_load_count, 784); - CHECK_OFFSET(vm_entry_intr_info_field, 788); - CHECK_OFFSET(vm_entry_exception_error_code, 792); - CHECK_OFFSET(vm_entry_instruction_len, 796); - CHECK_OFFSET(tpr_threshold, 800); - CHECK_OFFSET(secondary_vm_exec_control, 804); - CHECK_OFFSET(vm_instruction_error, 808); - CHECK_OFFSET(vm_exit_reason, 812); - CHECK_OFFSET(vm_exit_intr_info, 816); - CHECK_OFFSET(vm_exit_intr_error_code, 820); - CHECK_OFFSET(idt_vectoring_info_field, 824); - CHECK_OFFSET(idt_vectoring_error_code, 828); - CHECK_OFFSET(vm_exit_instruction_len, 832); - CHECK_OFFSET(vmx_instruction_info, 836); - CHECK_OFFSET(guest_es_limit, 840); - CHECK_OFFSET(guest_cs_limit, 844); - CHECK_OFFSET(guest_ss_limit, 848); - CHECK_OFFSET(guest_ds_limit, 852); - CHECK_OFFSET(guest_fs_limit, 856); - CHECK_OFFSET(guest_gs_limit, 860); - CHECK_OFFSET(guest_ldtr_limit, 864); - CHECK_OFFSET(guest_tr_limit, 868); - CHECK_OFFSET(guest_gdtr_limit, 872); - CHECK_OFFSET(guest_idtr_limit, 876); - CHECK_OFFSET(guest_es_ar_bytes, 880); - CHECK_OFFSET(guest_cs_ar_bytes, 884); - CHECK_OFFSET(guest_ss_ar_bytes, 888); - CHECK_OFFSET(guest_ds_ar_bytes, 892); - CHECK_OFFSET(guest_fs_ar_bytes, 896); - CHECK_OFFSET(guest_gs_ar_bytes, 900); - CHECK_OFFSET(guest_ldtr_ar_bytes, 904); - CHECK_OFFSET(guest_tr_ar_bytes, 908); - CHECK_OFFSET(guest_interruptibility_info, 912); - CHECK_OFFSET(guest_activity_state, 916); - CHECK_OFFSET(guest_sysenter_cs, 920); - CHECK_OFFSET(host_ia32_sysenter_cs, 924); - CHECK_OFFSET(vmx_preemption_timer_value, 928); - CHECK_OFFSET(virtual_processor_id, 960); - CHECK_OFFSET(posted_intr_nv, 962); - CHECK_OFFSET(guest_es_selector, 964); - CHECK_OFFSET(guest_cs_selector, 966); - CHECK_OFFSET(guest_ss_selector, 968); - CHECK_OFFSET(guest_ds_selector, 970); - CHECK_OFFSET(guest_fs_selector, 972); - CHECK_OFFSET(guest_gs_selector, 974); - CHECK_OFFSET(guest_ldtr_selector, 976); - CHECK_OFFSET(guest_tr_selector, 978); - CHECK_OFFSET(guest_intr_status, 980); - CHECK_OFFSET(host_es_selector, 982); - CHECK_OFFSET(host_cs_selector, 984); - CHECK_OFFSET(host_ss_selector, 986); - CHECK_OFFSET(host_ds_selector, 988); - CHECK_OFFSET(host_fs_selector, 990); - CHECK_OFFSET(host_gs_selector, 992); - CHECK_OFFSET(host_tr_selector, 994); - CHECK_OFFSET(guest_pml_index, 996); -} - -/* - * VMCS12_REVISION is an arbitrary id that should be changed if the content or - * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and - * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. - * - * IMPORTANT: Changing this value will break save/restore compatibility with - * older kvm releases. - */ -#define VMCS12_REVISION 0x11e57ed0 - -/* - * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region - * and any VMCS region. Although only sizeof(struct vmcs12) are used by the - * current implementation, 4K are reserved to avoid future complications. - */ -#define VMCS12_SIZE 0x1000 - -/* - * VMCS12_MAX_FIELD_INDEX is the highest index value used in any - * supported VMCS12 field encoding. - */ -#define VMCS12_MAX_FIELD_INDEX 0x17 - -struct nested_vmx_msrs { - /* - * We only store the "true" versions of the VMX capability MSRs. We - * generate the "non-true" versions by setting the must-be-1 bits - * according to the SDM. - */ - u32 procbased_ctls_low; - u32 procbased_ctls_high; - u32 secondary_ctls_low; - u32 secondary_ctls_high; - u32 pinbased_ctls_low; - u32 pinbased_ctls_high; - u32 exit_ctls_low; - u32 exit_ctls_high; - u32 entry_ctls_low; - u32 entry_ctls_high; - u32 misc_low; - u32 misc_high; - u32 ept_caps; - u32 vpid_caps; - u64 basic; - u64 cr0_fixed0; - u64 cr0_fixed1; - u64 cr4_fixed0; - u64 cr4_fixed1; - u64 vmcs_enum; - u64 vmfunc_controls; -}; - -/* - * The nested_vmx structure is part of vcpu_vmx, and holds information we need - * for correct emulation of VMX (i.e., nested VMX) on this vcpu. - */ -struct nested_vmx { - /* Has the level1 guest done vmxon? */ - bool vmxon; - gpa_t vmxon_ptr; - bool pml_full; - - /* The guest-physical address of the current VMCS L1 keeps for L2 */ - gpa_t current_vmptr; - /* - * Cache of the guest's VMCS, existing outside of guest memory. - * Loaded from guest memory during VMPTRLD. Flushed to guest - * memory during VMCLEAR and VMPTRLD. - */ - struct vmcs12 *cached_vmcs12; - /* - * Cache of the guest's shadow VMCS, existing outside of guest - * memory. Loaded from guest memory during VM entry. Flushed - * to guest memory during VM exit. - */ - struct vmcs12 *cached_shadow_vmcs12; - /* - * Indicates if the shadow vmcs or enlightened vmcs must be updated - * with the data held by struct vmcs12. - */ - bool need_vmcs12_sync; - bool dirty_vmcs12; - - /* - * vmcs02 has been initialized, i.e. state that is constant for - * vmcs02 has been written to the backing VMCS. Initialization - * is delayed until L1 actually attempts to run a nested VM. - */ - bool vmcs02_initialized; - - bool change_vmcs01_virtual_apic_mode; - - /* - * Enlightened VMCS has been enabled. It does not mean that L1 has to - * use it. However, VMX features available to L1 will be limited based - * on what the enlightened VMCS supports. - */ - bool enlightened_vmcs_enabled; - - /* L2 must run next, and mustn't decide to exit to L1. */ - bool nested_run_pending; - - struct loaded_vmcs vmcs02; - - /* - * Guest pages referred to in the vmcs02 with host-physical - * pointers, so we must keep them pinned while L2 runs. - */ - struct page *apic_access_page; - struct page *virtual_apic_page; - struct page *pi_desc_page; - struct pi_desc *pi_desc; - bool pi_pending; - u16 posted_intr_nv; - - struct hrtimer preemption_timer; - bool preemption_timer_expired; - - /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ - u64 vmcs01_debugctl; - u64 vmcs01_guest_bndcfgs; - - u16 vpid02; - u16 last_vpid; - - struct nested_vmx_msrs msrs; - - /* SMM related state */ - struct { - /* in VMX operation on SMM entry? */ - bool vmxon; - /* in guest mode on SMM entry? */ - bool guest_mode; - } smm; - - gpa_t hv_evmcs_vmptr; - struct page *hv_evmcs_page; - struct hv_enlightened_vmcs *hv_evmcs; -}; - -#define POSTED_INTR_ON 0 -#define POSTED_INTR_SN 1 - -/* Posted-Interrupt Descriptor */ -struct pi_desc { - u32 pir[8]; /* Posted interrupt requested */ - union { - struct { - /* bit 256 - Outstanding Notification */ - u16 on : 1, - /* bit 257 - Suppress Notification */ - sn : 1, - /* bit 271:258 - Reserved */ - rsvd_1 : 14; - /* bit 279:272 - Notification Vector */ - u8 nv; - /* bit 287:280 - Reserved */ - u8 rsvd_2; - /* bit 319:288 - Notification Destination */ - u32 ndst; - }; - u64 control; - }; - u32 rsvd[6]; -} __aligned(64); - -static bool pi_test_and_set_on(struct pi_desc *pi_desc) -{ - return test_and_set_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static bool pi_test_and_clear_on(struct pi_desc *pi_desc) -{ - return test_and_clear_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) -{ - return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); -} - -static inline void pi_clear_sn(struct pi_desc *pi_desc) -{ - return clear_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_set_sn(struct pi_desc *pi_desc) -{ - return set_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_clear_on(struct pi_desc *pi_desc) -{ - clear_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline int pi_test_on(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline int pi_test_sn(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -struct vmx_msrs { - unsigned int nr; - struct vmx_msr_entry val[NR_AUTOLOAD_MSRS]; -}; - -struct vcpu_vmx { - struct kvm_vcpu vcpu; - unsigned long host_rsp; - u8 fail; - u8 msr_bitmap_mode; - u32 exit_intr_info; - u32 idt_vectoring_info; - ulong rflags; - struct shared_msr_entry *guest_msrs; - int nmsrs; - int save_nmsrs; - bool guest_msrs_dirty; - unsigned long host_idt_base; -#ifdef CONFIG_X86_64 - u64 msr_host_kernel_gs_base; - u64 msr_guest_kernel_gs_base; -#endif - - u64 arch_capabilities; - u64 spec_ctrl; - - u32 vm_entry_controls_shadow; - u32 vm_exit_controls_shadow; - u32 secondary_exec_control; - - /* - * loaded_vmcs points to the VMCS currently used in this vcpu. For a - * non-nested (L1) guest, it always points to vmcs01. For a nested - * guest (L2), it points to a different VMCS. loaded_cpu_state points - * to the VMCS whose state is loaded into the CPU registers that only - * need to be switched when transitioning to/from the kernel; a NULL - * value indicates that host state is loaded. - */ - struct loaded_vmcs vmcs01; - struct loaded_vmcs *loaded_vmcs; - struct loaded_vmcs *loaded_cpu_state; - bool __launched; /* temporary, used in vmx_vcpu_run */ - struct msr_autoload { - struct vmx_msrs guest; - struct vmx_msrs host; - } msr_autoload; - - struct { - int vm86_active; - ulong save_rflags; - struct kvm_segment segs[8]; - } rmode; - struct { - u32 bitmask; /* 4 bits per segment (1 bit per field) */ - struct kvm_save_segment { - u16 selector; - unsigned long base; - u32 limit; - u32 ar; - } seg[8]; - } segment_cache; - int vpid; - bool emulation_required; - - u32 exit_reason; - - /* Posted interrupt descriptor */ - struct pi_desc pi_desc; - - /* Support for a guest hypervisor (nested VMX) */ - struct nested_vmx nested; - - /* Dynamic PLE window. */ - int ple_window; - bool ple_window_dirty; - - bool req_immediate_exit; - - /* Support for PML */ -#define PML_ENTITY_NUM 512 - struct page *pml_pg; - - /* apic deadline value in host tsc */ - u64 hv_deadline_tsc; - - u64 current_tsc_ratio; - - u32 host_pkru; - - unsigned long host_debugctlmsr; - - /* - * Only bits masked by msr_ia32_feature_control_valid_bits can be set in - * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included - * in msr_ia32_feature_control_valid_bits. - */ - u64 msr_ia32_feature_control; - u64 msr_ia32_feature_control_valid_bits; - u64 ept_pointer; -}; - -enum segment_cache_field { - SEG_FIELD_SEL = 0, - SEG_FIELD_BASE = 1, - SEG_FIELD_LIMIT = 2, - SEG_FIELD_AR = 3, - - SEG_FIELD_NR = 4 -}; - -static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) -{ - return container_of(kvm, struct kvm_vmx, kvm); -} - -static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) -{ - return container_of(vcpu, struct vcpu_vmx, vcpu); -} - -static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) -{ - return &(to_vmx(vcpu)->pi_desc); -} - -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) -#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) -#define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name) -#define FIELD64(number, name) \ - FIELD(number, name), \ - [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32) - - -static u16 shadow_read_only_fields[] = { -#define SHADOW_FIELD_RO(x) x, -#include "vmx_shadow_fields.h" -}; -static int max_shadow_read_only_fields = - ARRAY_SIZE(shadow_read_only_fields); - -static u16 shadow_read_write_fields[] = { -#define SHADOW_FIELD_RW(x) x, -#include "vmx_shadow_fields.h" -}; -static int max_shadow_read_write_fields = - ARRAY_SIZE(shadow_read_write_fields); - -static const unsigned short vmcs_field_to_offset_table[] = { - FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), - FIELD(POSTED_INTR_NV, posted_intr_nv), - FIELD(GUEST_ES_SELECTOR, guest_es_selector), |
