diff options
Diffstat (limited to 'arch/x86/kvm/vmx/nested.c')
| -rw-r--r-- | arch/x86/kvm/vmx/nested.c | 5721 |
1 files changed, 5721 insertions, 0 deletions
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c new file mode 100644 index 000000000000..3170e291215d --- /dev/null +++ b/arch/x86/kvm/vmx/nested.c @@ -0,0 +1,5721 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/frame.h> +#include <linux/percpu.h> + +#include <asm/debugreg.h> +#include <asm/mmu_context.h> + +#include "cpuid.h" +#include "hyperv.h" +#include "mmu.h" +#include "nested.h" +#include "trace.h" +#include "x86.h" + +static bool __read_mostly enable_shadow_vmcs = 1; +module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); + +static bool __read_mostly nested_early_check = 0; +module_param(nested_early_check, bool, S_IRUGO); + +/* + * Hyper-V requires all of these, so mark them as supported even though + * they are just treated the same as all-context. + */ +#define VMX_VPID_EXTENT_SUPPORTED_MASK \ + (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ + VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ + VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ + VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) + +#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 + +enum { + VMX_VMREAD_BITMAP, + VMX_VMWRITE_BITMAP, + VMX_BITMAP_NR +}; +static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; + +#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) +#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) + +static u16 shadow_read_only_fields[] = { +#define SHADOW_FIELD_RO(x) x, +#include "vmcs_shadow_fields.h" +}; +static int max_shadow_read_only_fields = + ARRAY_SIZE(shadow_read_only_fields); + +static u16 shadow_read_write_fields[] = { +#define SHADOW_FIELD_RW(x) x, +#include "vmcs_shadow_fields.h" +}; +static int max_shadow_read_write_fields = + ARRAY_SIZE(shadow_read_write_fields); + +void init_vmcs_shadow_fields(void) +{ + int i, j; + + memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); + memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); + + for (i = j = 0; i < max_shadow_read_only_fields; i++) { + u16 field = shadow_read_only_fields[i]; + + if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && + (i + 1 == max_shadow_read_only_fields || + shadow_read_only_fields[i + 1] != field + 1)) + pr_err("Missing field from shadow_read_only_field %x\n", + field + 1); + + clear_bit(field, vmx_vmread_bitmap); +#ifdef CONFIG_X86_64 + if (field & 1) + continue; +#endif + if (j < i) + shadow_read_only_fields[j] = field; + j++; + } + max_shadow_read_only_fields = j; + + for (i = j = 0; i < max_shadow_read_write_fields; i++) { + u16 field = shadow_read_write_fields[i]; + + if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && + (i + 1 == max_shadow_read_write_fields || + shadow_read_write_fields[i + 1] != field + 1)) + pr_err("Missing field from shadow_read_write_field %x\n", + field + 1); + + /* + * PML and the preemption timer can be emulated, but the + * processor cannot vmwrite to fields that don't exist + * on bare metal. + */ + switch (field) { + case GUEST_PML_INDEX: + if (!cpu_has_vmx_pml()) + continue; + break; + case VMX_PREEMPTION_TIMER_VALUE: + if (!cpu_has_vmx_preemption_timer()) + continue; + break; + case GUEST_INTR_STATUS: + if (!cpu_has_vmx_apicv()) + continue; + break; + default: + break; + } + + clear_bit(field, vmx_vmwrite_bitmap); + clear_bit(field, vmx_vmread_bitmap); +#ifdef CONFIG_X86_64 + if (field & 1) + continue; +#endif + if (j < i) + shadow_read_write_fields[j] = field; + j++; + } + max_shadow_read_write_fields = j; +} + +/* + * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), + * set the success or error code of an emulated VMX instruction (as specified + * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated + * instruction. + */ +static int nested_vmx_succeed(struct kvm_vcpu *vcpu) +{ + vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); + return kvm_skip_emulated_instruction(vcpu); +} + +static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) +{ + vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | + X86_EFLAGS_SF | X86_EFLAGS_OF)) + | X86_EFLAGS_CF); + return kvm_skip_emulated_instruction(vcpu); +} + +static int nested_vmx_failValid(struct kvm_vcpu *vcpu, + u32 vm_instruction_error) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * failValid writes the error number to the current VMCS, which + * can't be done if there isn't a current VMCS. + */ + if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs) + return nested_vmx_failInvalid(vcpu); + + vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_SF | X86_EFLAGS_OF)) + | X86_EFLAGS_ZF); + get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; + /* + * We don't need to force a shadow sync because + * VM_INSTRUCTION_ERROR is not shadowed + */ + return kvm_skip_emulated_instruction(vcpu); +} + +static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) +{ + /* TODO: not to reset guest simply here. */ + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); +} + +static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) +{ + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); + vmcs_write64(VMCS_LINK_POINTER, -1ull); +} + +static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!vmx->nested.hv_evmcs) + return; + + kunmap(vmx->nested.hv_evmcs_page); + kvm_release_page_dirty(vmx->nested.hv_evmcs_page); + vmx->nested.hv_evmcs_vmptr = -1ull; + vmx->nested.hv_evmcs_page = NULL; + vmx->nested.hv_evmcs = NULL; +} + +/* + * Free whatever needs to be freed from vmx->nested when L1 goes down, or + * just stops using VMX. + */ +static void free_nested(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) + return; + + vmx->nested.vmxon = false; + vmx->nested.smm.vmxon = false; + free_vpid(vmx->nested.vpid02); + vmx->nested.posted_intr_nv = -1; + vmx->nested.current_vmptr = -1ull; + if (enable_shadow_vmcs) { + vmx_disable_shadow_vmcs(vmx); + vmcs_clear(vmx->vmcs01.shadow_vmcs); + free_vmcs(vmx->vmcs01.shadow_vmcs); + vmx->vmcs01.shadow_vmcs = NULL; + } + kfree(vmx->nested.cached_vmcs12); + kfree(vmx->nested.cached_shadow_vmcs12); + /* Unpin physical memory we referred to in the vmcs02 */ + if (vmx->nested.apic_access_page) { + kvm_release_page_dirty(vmx->nested.apic_access_page); + vmx->nested.apic_access_page = NULL; + } + if (vmx->nested.virtual_apic_page) { + kvm_release_page_dirty(vmx->nested.virtual_apic_page); + vmx->nested.virtual_apic_page = NULL; + } + if (vmx->nested.pi_desc_page) { + kunmap(vmx->nested.pi_desc_page); + kvm_release_page_dirty(vmx->nested.pi_desc_page); + vmx->nested.pi_desc_page = NULL; + vmx->nested.pi_desc = NULL; + } + + kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); + + nested_release_evmcs(vcpu); + + free_loaded_vmcs(&vmx->nested.vmcs02); +} + +static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int cpu; + + if (vmx->loaded_vmcs == vmcs) + return; + + cpu = get_cpu(); + vmx_vcpu_put(vcpu); + vmx->loaded_vmcs = vmcs; + vmx_vcpu_load(vcpu, cpu); + put_cpu(); + + vm_entry_controls_reset_shadow(vmx); + vm_exit_controls_reset_shadow(vmx); + vmx_segment_cache_clear(vmx); +} + +/* + * Ensure that the current vmcs of the logical processor is the + * vmcs01 of the vcpu before calling free_nested(). + */ +void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) +{ + vcpu_load(vcpu); + vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01); + free_nested(vcpu); + vcpu_put(vcpu); +} + +static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 exit_reason; + unsigned long exit_qualification = vcpu->arch.exit_qualification; + + if (vmx->nested.pml_full) { + exit_reason = EXIT_REASON_PML_FULL; + vmx->nested.pml_full = false; + exit_qualification &= INTR_INFO_UNBLOCK_NMI; + } else if (fault->error_code & PFERR_RSVD_MASK) + exit_reason = EXIT_REASON_EPT_MISCONFIG; + else + exit_reason = EXIT_REASON_EPT_VIOLATION; + + nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification); + vmcs12->guest_physical_address = fault->address; +} + +static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) +{ + WARN_ON(mmu_is_nested(vcpu)); + + vcpu->arch.mmu = &vcpu->arch.guest_mmu; + kvm_init_shadow_ept_mmu(vcpu, + to_vmx(vcpu)->nested.msrs.ept_caps & + VMX_EPT_EXECUTE_ONLY_BIT, + nested_ept_ad_enabled(vcpu), + nested_ept_get_cr3(vcpu)); + vcpu->arch.mmu->set_cr3 = vmx_set_cr3; + vcpu->arch.mmu->get_cr3 = nested_ept_get_cr3; + vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; + vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; + + vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; +} + +static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) +{ + vcpu->arch.mmu = &vcpu->arch.root_mmu; + vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; +} + +static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, + u16 error_code) +{ + bool inequality, bit; + + bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; + inequality = + (error_code & vmcs12->page_fault_error_code_mask) != + vmcs12->page_fault_error_code_match; + return inequality ^ bit; +} + + +/* + * KVM wants to inject page-faults which it got to the guest. This function + * checks whether in a nested guest, we need to inject them to L1 or L2. + */ +static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned int nr = vcpu->arch.exception.nr; + bool has_payload = vcpu->arch.exception.has_payload; + unsigned long payload = vcpu->arch.exception.payload; + + if (nr == PF_VECTOR) { + if (vcpu->arch.exception.nested_apf) { + *exit_qual = vcpu->arch.apf.nested_apf_token; + return 1; + } + if (nested_vmx_is_page_fault_vmexit(vmcs12, + vcpu->arch.exception.error_code)) { + *exit_qual = has_payload ? payload : vcpu->arch.cr2; + return 1; + } + } else if (vmcs12->exception_bitmap & (1u << nr)) { + if (nr == DB_VECTOR) { + if (!has_payload) { + payload = vcpu->arch.dr6; + payload &= ~(DR6_FIXED_1 | DR6_BT); + payload ^= DR6_RTM; + } + *exit_qual = payload; + } else + *exit_qual = 0; + return 1; + } + + return 0; +} + + +static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, + struct x86_exception *fault) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + WARN_ON(!is_guest_mode(vcpu)); + + if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) && + !to_vmx(vcpu)->nested.nested_run_pending) { + vmcs12->vm_exit_intr_error_code = fault->error_code; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, + PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | + INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK, + fault->address); + } else { + kvm_inject_page_fault(vcpu, fault); + } +} + +static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); +} + +static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) + return 0; + + if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) || + !page_address_valid(vcpu, vmcs12->io_bitmap_b)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) + return 0; + + if (!page_address_valid(vcpu, vmcs12->msr_bitmap)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) + return 0; + + if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)) + return -EINVAL; + + return 0; +} + +/* + * Check if MSR is intercepted for L01 MSR bitmap. + */ +static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr) +{ + unsigned long *msr_bitmap; + int f = sizeof(unsigned long); + + if (!cpu_has_vmx_msr_bitmap()) + return true; + + msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; + + if (msr <= 0x1fff) { + return !!test_bit(msr, msr_bitmap + 0x800 / f); + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + return !!test_bit(msr, msr_bitmap + 0xc00 / f); + } + + return true; +} + +/* + * If a msr is allowed by L0, we should check whether it is allowed by L1. + * The corresponding bit will be cleared unless both of L0 and L1 allow it. + */ +static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, + unsigned long *msr_bitmap_nested, + u32 msr, int type) +{ + int f = sizeof(unsigned long); + + /* + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals + * have the write-low and read-high bitmap offsets the wrong way round. + * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. + */ + if (msr <= 0x1fff) { + if (type & MSR_TYPE_R && + !test_bit(msr, msr_bitmap_l1 + 0x000 / f)) + /* read-low */ + __clear_bit(msr, msr_bitmap_nested + 0x000 / f); + + if (type & MSR_TYPE_W && + !test_bit(msr, msr_bitmap_l1 + 0x800 / f)) + /* write-low */ + __clear_bit(msr, msr_bitmap_nested + 0x800 / f); + + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + if (type & MSR_TYPE_R && + !test_bit(msr, msr_bitmap_l1 + 0x400 / f)) + /* read-high */ + __clear_bit(msr, msr_bitmap_nested + 0x400 / f); + + if (type & MSR_TYPE_W && + !test_bit(msr, msr_bitmap_l1 + 0xc00 / f)) + /* write-high */ + __clear_bit(msr, msr_bitmap_nested + 0xc00 / f); + + } +} + +/* + * Merge L0's and L1's MSR bitmap, return false to indicate that + * we do not use the hardware. + */ +static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int msr; + struct page *page; + unsigned long *msr_bitmap_l1; + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; + /* + * pred_cmd & spec_ctrl are trying to verify two things: + * + * 1. L0 gave a permission to L1 to actually passthrough the MSR. This + * ensures that we do not accidentally generate an L02 MSR bitmap + * from the L12 MSR bitmap that is too permissive. + * 2. That L1 or L2s have actually used the MSR. This avoids + * unnecessarily merging of the bitmap if the MSR is unused. This + * works properly because we only update the L01 MSR bitmap lazily. + * So even if L0 should pass L1 these MSRs, the L01 bitmap is only + * updated to reflect this when L1 (or its L2s) actually write to + * the MSR. + */ + bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD); + bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL); + + /* Nothing to do if the MSR bitmap is not in use. */ + if (!cpu_has_vmx_msr_bitmap() || + !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) + return false; + + if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && + !pred_cmd && !spec_ctrl) + return false; + + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap); + if (is_error_page(page)) + return false; + + msr_bitmap_l1 = (unsigned long *)kmap(page); + if (nested_cpu_has_apic_reg_virt(vmcs12)) { + /* + * L0 need not intercept reads for MSRs between 0x800 and 0x8ff, it + * just lets the processor take the value from the virtual-APIC page; + * take those 256 bits directly from the L1 bitmap. + */ + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { + unsigned word = msr / BITS_PER_LONG; + msr_bitmap_l0[word] = msr_bitmap_l1[word]; + msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0; + } + } else { + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { + unsigned word = msr / BITS_PER_LONG; + msr_bitmap_l0[word] = ~0; + msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0; + } + } + + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + X2APIC_MSR(APIC_TASKPRI), + MSR_TYPE_W); + + if (nested_cpu_has_vid(vmcs12)) { + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + X2APIC_MSR(APIC_EOI), + MSR_TYPE_W); + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + X2APIC_MSR(APIC_SELF_IPI), + MSR_TYPE_W); + } + + if (spec_ctrl) + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_SPEC_CTRL, + MSR_TYPE_R | MSR_TYPE_W); + + if (pred_cmd) + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PRED_CMD, + MSR_TYPE_W); + + kunmap(page); + kvm_release_page_clean(page); + + return true; +} + +static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vmcs12 *shadow; + struct page *page; + + if (!nested_cpu_has_shadow_vmcs(vmcs12) || + vmcs12->vmcs_link_pointer == -1ull) + return; + + shadow = get_shadow_vmcs12(vcpu); + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); + + memcpy(shadow, kmap(page), VMCS12_SIZE); + + kunmap(page); + kvm_release_page_clean(page); +} + +static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!nested_cpu_has_shadow_vmcs(vmcs12) || + vmcs12->vmcs_link_pointer == -1ull) + return; + + kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer, + get_shadow_vmcs12(vcpu), VMCS12_SIZE); +} + +/* + * In nested virtualization, check if L1 has set + * VM_EXIT_ACK_INTR_ON_EXIT + */ +static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) +{ + return get_vmcs12(vcpu)->vm_exit_controls & + VM_EXIT_ACK_INTR_ON_EXIT; +} + +static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) +{ + return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu)); +} + +static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && + !page_address_valid(vcpu, vmcs12->apic_access_addr)) + return -EINVAL; + else + return 0; +} + +static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && + !nested_cpu_has_apic_reg_virt(vmcs12) && + !nested_cpu_has_vid(vmcs12) && + !nested_cpu_has_posted_intr(vmcs12)) + return 0; + + /* + * If virtualize x2apic mode is enabled, + * virtualize apic access must be disabled. + */ + if (nested_cpu_has_virt_x2apic_mode(vmcs12) && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + return -EINVAL; + + /* + * If virtual interrupt delivery is enabled, + * we must exit on external interrupts. + */ + if (nested_cpu_has_vid(vmcs12) && + !nested_exit_on_intr(vcpu)) + return -EINVAL; + + /* + * bits 15:8 should be zero in posted_intr_nv, + * the descriptor address has been already checked + * in nested_get_vmcs12_pages. + * + * bits 5:0 of posted_intr_desc_addr should be zero. + */ + if (nested_cpu_has_posted_intr(vmcs12) && + (!nested_cpu_has_vid(vmcs12) || + !nested_exit_intr_ack_set(vcpu) || + (vmcs12->posted_intr_nv & 0xff00) || + (vmcs12->posted_intr_desc_addr & 0x3f) || + (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))) + return -EINVAL; + + /* tpr shadow is needed by all apicv features. */ + if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, + u32 count, u64 addr) +{ + int maxphyaddr; + + if (count == 0) + return 0; + maxphyaddr = cpuid_maxphyaddr(vcpu); + if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || + (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_load_count, + vmcs12->vm_exit_msr_load_addr) || + nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_store_count, + vmcs12->vm_exit_msr_store_addr)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_entry_msr_load_count, + vmcs12->vm_entry_msr_load_addr)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_pml(vmcs12)) + return 0; + + if (!nested_cpu_has_ept(vmcs12) || + !page_address_valid(vcpu, vmcs12->pml_address)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && + !nested_cpu_has_ept(vmcs12)) + return -EINVAL; + return 0; +} + +static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && + !nested_cpu_has_ept(vmcs12)) + return -EINVAL; + return 0; +} + +static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_shadow_vmcs(vmcs12)) + return 0; + + if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) || + !page_address_valid(vcpu, vmcs12->vmwrite_bitmap)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + /* x2APIC MSR accesses are not allowed */ + if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8) + return -EINVAL; + if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ + e->index == MSR_IA32_UCODE_REV) + return -EINVAL; + if (e->reserved != 0) + return -EINVAL; + return 0; +} + +static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + if (e->index == MSR_FS_BASE || + e->index == MSR_GS_BASE || + e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */ + nested_vmx_msr_check_common(vcpu, e)) + return -EINVAL; + return 0; +} + +static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */ + nested_vmx_msr_check_common(vcpu, e)) + return -EINVAL; + return 0; +} + +/* + * Load guest's/host's msr at nested entry/exit. + * return 0 for success, entry index for failure. + */ +static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) +{ + u32 i; + struct vmx_msr_entry e; + struct msr_data msr; + + msr.host_initiated = false; + for (i = 0; i < count; i++) { + if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), + &e, sizeof(e))) { + pr_debug_ratelimited( + "%s cannot read MSR entry (%u, 0x%08llx)\n", + __func__, i, gpa + i * sizeof(e)); + goto fail; + } + if (nested_vmx_load_msr_check(vcpu, &e)) { + pr_debug_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, i, e.index, e.reserved); + goto fail; + } + msr.index = e.index; + msr.data = e.value; + if (kvm_set_msr(vcpu, &msr)) { + pr_debug_ratelimited( + "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", + __func__, i, e.index, e.value); + goto fail; + } + } + return 0; +fail: + return i + 1; +} + +static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) +{ + u32 i; + struct vmx_msr_entry e; + + for (i = 0; i < count; i++) { + struct msr_data msr_info; + if (kvm_vcpu_read_guest(vcpu, + gpa + i * sizeof(e), + &e, 2 * sizeof(u32))) { + pr_debug_ratelimited( + "%s cannot read MSR entry (%u, 0x%08llx)\n", + __func__, i, gpa + i * sizeof(e)); + return -EINVAL; + } + if (nested_vmx_store_msr_check(vcpu, &e)) { + pr_debug_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, i, e.index, e.reserved); + return -EINVAL; + } + msr_info.host_initiated = false; + msr_info.index = e.index; + if (kvm_get_msr(vcpu, &msr_info)) { + pr_debug_ratelimited( + "%s cannot read MSR (%u, 0x%x)\n", + __func__, i, e.index); + return -EINVAL; + } + if (kvm_vcpu_write_guest(vcpu, + gpa + i * sizeof(e) + + offsetof(struct vmx_msr_entry, value), + &msr_info.data, sizeof(msr_info.data))) { + pr_debug_ratelimited( + "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", + __func__, i, e.index, msr_info.data); + return -EINVAL; + } + } + return 0; +} + +static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + unsigned long invalid_mask; + + invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu); + return (val & invalid_mask) == 0; +} + +/* + * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are + * emulating VM entry into a guest with EPT enabled. + * Returns 0 on success, 1 on failure. Invalid state exit qualification code + * is assigned to entry_failure_code on failure. + */ +static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, + u32 *entry_failure_code) +{ + if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { + if (!nested_cr3_valid(vcpu, cr3)) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; + return 1; + } + + /* + * If PAE paging and EPT are both on, CR3 is not used by the CPU and + * must not be dereferenced. + */ + if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) && + !nested_ept) { + if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) { + *entry_failure_code = ENTRY_FAIL_PDPTE; + return 1; + } + } + } + + if (!nested_ept) + kvm_mmu_new_cr3(vcpu, cr3, false); + + vcpu->arch.cr3 = cr3; + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + + kvm_init_mmu(vcpu, false); + + return 0; +} + +/* + * Returns if KVM is able to config CPU to tag TLB entries + * populated by L2 differently than TLB entries populated + * by L1. + * + * If L1 uses EPT, then TLB entries are tagged with different EPTP. + * + * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged + * with different VPID (L1 entries are tagged with vmx->vpid + * while L2 entries are tagged with vmx->nested.vpid02). + */ +static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + return nested_cpu_has_ept(vmcs12) || + (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); +} + +static u16 nested_get_vpid02(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid; +} + + +static inline bool vmx_control_verify(u32 control, u32 low, u32 high) +{ + return fixed_bits_valid(control, low, high); +} + +static inline u64 vmx_control_msr(u32 low, u32 high) +{ + return low | ((u64)high << 32); +} + +static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) +{ + superset &= mask; + subset &= mask; + + return (superset | subset) == superset; +} + +static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) +{ + const u64 feature_and_reserved = + /* feature (except bit 48; see below) */ + BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | + /* reserved */ + BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); + u64 vmx_basic = vmx->nested.msrs.basic; + + if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) + return -EINVAL; + + /* + * KVM does not emulate a version of VMX that constrains physical + * addresses of VMX structures (e.g. VMCS) to 32-bits. + */ + if (data & BIT_ULL(48)) + return -EINVAL; + + if (vmx_basic_vmcs_revision_id(vmx_basic) != + vmx_basic_vmcs_revision_id(data)) + return -EINVAL; + + if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) + return -EINVAL; + + vmx->nested.msrs.basic = data; + return 0; +} + +static int +vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + u64 supported; + u32 *lowp, *highp; + + switch (msr_index) { + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: + lowp = &vmx->nested.msrs.pinbased_ctls_low; + highp = &vmx->nested.msrs.pinbased_ctls_high; + break; + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + lowp = &vmx->nested.msrs.procbased_ctls_low; + highp = &vmx->nested.msrs.procbased_ctls_high; + break; + case MSR_IA32_VMX_TRUE_EXIT_CTLS: + lowp = &vmx->nested.msrs.exit_ctls_low; + highp = &vmx->nested.msrs.exit_ctls_high; + break; + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + lowp = &vmx->nested.msrs.entry_ctls_low; + highp = &vmx->nested.msrs.entry_ctls_high; + break; + case MSR_IA32_VMX_PROCBASED_CTLS2: + lowp = &vmx->nested.msrs.secondary_ctls_low; + highp = &vmx->nested.msrs.secondary_ctls_high; + break; + default: + BUG(); + } + + supported = vmx_control_msr(*lowp, *highp); + + /* Check must-be-1 bits are still 1. */ + if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) + return -EINVAL; + + /* Check must-be-0 bits are still 0. */ + if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) + return -EINVAL; + + *lowp = data; + *highp = data >> 32; + return 0; +} + +static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) +{ + const u64 feature_and_reserved_bits = + /* feature */ + BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | + BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | + /* reserved */ + GENMASK_ULL(13, 9) | BIT_ULL(31); + u64 vmx_misc; + + vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, + vmx->nested.msrs.misc_high); + + if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) + return -EINVAL; + + if ((vmx->nested.msrs.pinbased_ctls_high & + PIN_BASED_VMX_PREEMPTION_TIMER) && + vmx_misc_preemption_timer_rate(data) != + vmx_misc_preemption_timer_rate(vmx_misc)) + return -EINVAL; + + if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) + return -EINVAL; + + if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) + return -EINVAL; + + if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) + return -EINVAL; + + vmx->nested.msrs.misc_low = data; + vmx->nested.msrs.misc_high = data >> 32; + + /* + * If L1 has read-only VM-exit information fields, use the + * less permissive vmx_vmwrite_bitmap to specify write + * permissions for the shadow VMCS. + */ + if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu)) + vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); + + return 0; +} + +static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) +{ + u64 vmx_ept_vpid_cap; + + vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps, + vmx->nested.msrs.vpid_caps); + + /* Every bit is either reserved or a feature bit. */ + if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) + return -EINVAL; + + vmx->nested.msrs.ept_caps = data; + vmx->nested.msrs.vpid_caps = data >> 32; + return 0; +} + +static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + u64 *msr; + + switch (msr_index) { + case MSR_IA32_VMX_CR0_FIXED0: + msr = &vmx->nested.msrs.cr0_fixed0; + break; + case MSR_IA32_VMX_CR4_FIXED0: + msr = &vmx->nested.msrs.cr4_fixed0; + break; + default: + BUG(); + } + + /* + * 1 bits (which indicates bits which "must-be-1" during VMX operation) + * must be 1 in the restored value. + */ + if (!is_bitwise_subset(data, *msr, -1ULL)) + return -EINVAL; + + *msr = data; + return 0; +} + +/* + * Called when userspace is restoring VMX MSRs. + * + * Returns 0 on success, non-0 otherwise. + */ +int vm |
