diff options
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/events/intel/ds.c | 8 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 48 | ||||
-rw-r--r-- | arch/x86/include/asm/vmx.h | 13 | ||||
-rw-r--r-- | arch/x86/kvm/Kconfig | 13 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 5 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 203 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu_internal.h | 28 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmutrace.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/paging_tmpl.h | 14 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.c | 24 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.h | 24 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 18 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm.c | 9 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmcs.h | 5 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 53 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.h | 6 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 15 |
17 files changed, 341 insertions, 147 deletions
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 2641ba620f12..e010bfed8417 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1237,11 +1237,11 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct pmu *pmu = event->pmu; /* - * Make sure we get updated with the first PEBS - * event. It will trigger also during removal, but - * that does not hurt: + * Make sure we get updated with the first PEBS event. + * During removal, ->pebs_data_cfg is still valid for + * the last PEBS event. Don't clear it. */ - if (cpuc->n_pebs == 1) + if ((cpuc->n_pebs == 1) && add) cpuc->pebs_data_cfg = PEBS_UPDATE_DS_SW; if (needed_cb != pebs_needs_sched_cb(cpuc)) { diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 01c69840647e..9d6368512be6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -254,28 +254,31 @@ enum x86_intercept_stage; KVM_GUESTDBG_INJECT_DB | \ KVM_GUESTDBG_BLOCKIRQ) +#define PFERR_PRESENT_MASK BIT(0) +#define PFERR_WRITE_MASK BIT(1) +#define PFERR_USER_MASK BIT(2) +#define PFERR_RSVD_MASK BIT(3) +#define PFERR_FETCH_MASK BIT(4) +#define PFERR_PK_MASK BIT(5) +#define PFERR_SGX_MASK BIT(15) +#define PFERR_GUEST_RMP_MASK BIT_ULL(31) +#define PFERR_GUEST_FINAL_MASK BIT_ULL(32) +#define PFERR_GUEST_PAGE_MASK BIT_ULL(33) +#define PFERR_GUEST_ENC_MASK BIT_ULL(34) +#define PFERR_GUEST_SIZEM_MASK BIT_ULL(35) +#define PFERR_GUEST_VMPL_MASK BIT_ULL(36) -#define PFERR_PRESENT_BIT 0 -#define PFERR_WRITE_BIT 1 -#define PFERR_USER_BIT 2 -#define PFERR_RSVD_BIT 3 -#define PFERR_FETCH_BIT 4 -#define PFERR_PK_BIT 5 -#define PFERR_SGX_BIT 15 -#define PFERR_GUEST_FINAL_BIT 32 -#define PFERR_GUEST_PAGE_BIT 33 -#define PFERR_IMPLICIT_ACCESS_BIT 48 - -#define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT) -#define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT) -#define PFERR_USER_MASK BIT(PFERR_USER_BIT) -#define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT) -#define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT) -#define PFERR_PK_MASK BIT(PFERR_PK_BIT) -#define PFERR_SGX_MASK BIT(PFERR_SGX_BIT) -#define PFERR_GUEST_FINAL_MASK BIT_ULL(PFERR_GUEST_FINAL_BIT) -#define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT) -#define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT) +/* + * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP checks + * when emulating instructions that triggers implicit access. + */ +#define PFERR_IMPLICIT_ACCESS BIT_ULL(48) +/* + * PRIVATE_ACCESS is a KVM-defined flag us to indicate that a fault occurred + * when the guest was accessing private memory. + */ +#define PFERR_PRIVATE_ACCESS BIT_ULL(49) +#define PFERR_SYNTHETIC_MASK (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS) #define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ PFERR_WRITE_MASK | \ @@ -1313,6 +1316,8 @@ struct kvm_arch { */ spinlock_t mmu_unsync_pages_lock; + u64 shadow_mmio_value; + struct iommu_domain *iommu_domain; bool iommu_noncoherent; #define __KVM_HAVE_ARCH_NONCOHERENT_DMA @@ -1846,6 +1851,7 @@ struct kvm_arch_async_pf { gfn_t gfn; unsigned long cr3; bool direct_map; + u64 error_code; }; extern u32 __read_mostly kvm_nr_uret_msrs; diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 4dba17363008..d77a31039f24 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -71,6 +71,7 @@ #define SECONDARY_EXEC_ENCLS_EXITING VMCS_CONTROL_BIT(ENCLS_EXITING) #define SECONDARY_EXEC_RDSEED_EXITING VMCS_CONTROL_BIT(RDSEED_EXITING) #define SECONDARY_EXEC_ENABLE_PML VMCS_CONTROL_BIT(PAGE_MOD_LOGGING) +#define SECONDARY_EXEC_EPT_VIOLATION_VE VMCS_CONTROL_BIT(EPT_VIOLATION_VE) #define SECONDARY_EXEC_PT_CONCEAL_VMX VMCS_CONTROL_BIT(PT_CONCEAL_VMX) #define SECONDARY_EXEC_ENABLE_XSAVES VMCS_CONTROL_BIT(XSAVES) #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC) @@ -226,6 +227,8 @@ enum vmcs_field { VMREAD_BITMAP_HIGH = 0x00002027, VMWRITE_BITMAP = 0x00002028, VMWRITE_BITMAP_HIGH = 0x00002029, + VE_INFORMATION_ADDRESS = 0x0000202A, + VE_INFORMATION_ADDRESS_HIGH = 0x0000202B, XSS_EXIT_BITMAP = 0x0000202C, XSS_EXIT_BITMAP_HIGH = 0x0000202D, ENCLS_EXITING_BITMAP = 0x0000202E, @@ -514,6 +517,7 @@ enum vmcs_field { #define VMX_EPT_IPAT_BIT (1ull << 6) #define VMX_EPT_ACCESS_BIT (1ull << 8) #define VMX_EPT_DIRTY_BIT (1ull << 9) +#define VMX_EPT_SUPPRESS_VE_BIT (1ull << 63) #define VMX_EPT_RWX_MASK (VMX_EPT_READABLE_MASK | \ VMX_EPT_WRITABLE_MASK | \ VMX_EPT_EXECUTABLE_MASK) @@ -630,4 +634,13 @@ enum vmx_l1d_flush_state { extern enum vmx_l1d_flush_state l1tf_vmx_mitigation; +struct vmx_ve_information { + u32 exit_reason; + u32 delivery; + u64 exit_qualification; + u64 guest_linear_address; + u64 guest_physical_address; + u16 eptp_index; +}; + #endif diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 0ebdd088f28b..d64fb2b3eb69 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -95,6 +95,19 @@ config KVM_INTEL To compile this as a module, choose M here: the module will be called kvm-intel. +config KVM_INTEL_PROVE_VE + bool "Check that guests do not receive #VE exceptions" + default KVM_PROVE_MMU || DEBUG_KERNEL + depends on KVM_INTEL + help + + Checks that KVM's page table management code will not incorrectly + let guests receive a virtualization exception. Virtualization + exceptions will be trapped by the hypervisor rather than injected + in the guest. + + If unsure, say N. + config X86_SGX_KVM bool "Software Guard eXtensions (SGX) Virtualization" depends on X86_SGX && KVM_INTEL diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 60f21bb4c27b..2343c9f00e31 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -213,7 +213,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, */ u64 implicit_access = access & PFERR_IMPLICIT_ACCESS; bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC; - int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1; + int index = (pfec | (not_smap ? PFERR_RSVD_MASK : 0)) >> 1; u32 errcode = PFERR_PRESENT_MASK; bool fault; @@ -234,8 +234,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3; /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */ - offset = (pfec & ~1) + - ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT)); + offset = (pfec & ~1) | ((pte_access & PT_USER_MASK) ? PFERR_RSVD_MASK : 0); pkru_bits &= mmu->pkru_mask >> offset; errcode |= -pkru_bits & PFERR_PK_MASK; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 08900a0563f9..510eb1117012 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -567,9 +567,9 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) if (!is_shadow_present_pte(old_spte) || !spte_has_volatile_bits(old_spte)) - __update_clear_spte_fast(sptep, 0ull); + __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE); else - old_spte = __update_clear_spte_slow(sptep, 0ull); + old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE); if (!is_shadow_present_pte(old_spte)) return old_spte; @@ -603,7 +603,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) */ static void mmu_spte_clear_no_track(u64 *sptep) { - __update_clear_spte_fast(sptep, 0ull); + __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE); } static u64 mmu_spte_get_lockless(u64 *sptep) @@ -1897,7 +1897,8 @@ static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i) { - if (!sp->spt[i]) + /* sp->spt[i] has initial value of shadow page table allocation */ + if (sp->spt[i] == SHADOW_NONPRESENT_VALUE) return 0; return vcpu->arch.mmu->sync_spte(vcpu, sp, i); @@ -2461,7 +2462,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, return kvm_mmu_prepare_zap_page(kvm, child, invalid_list); } - } else if (is_mmio_spte(pte)) { + } else if (is_mmio_spte(kvm, pte)) { mmu_spte_clear_no_track(spte); } return 0; @@ -3261,9 +3262,19 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu, { gva_t gva = fault->is_tdp ? 0 : fault->addr; + if (fault->is_private) { + kvm_mmu_prepare_memory_fault_exit(vcpu, fault); + return -EFAULT; + } + vcpu_cache_mmio_info(vcpu, gva, fault->gfn, access & shadow_mmio_access_mask); + fault->slot = NULL; + fault->pfn = KVM_PFN_NOSLOT; + fault->map_writable = false; + fault->hva = KVM_HVA_ERR_BAD; + /* * If MMIO caching is disabled, emulate immediately without * touching the shadow page tables as attempting to install an @@ -4143,7 +4154,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) if (WARN_ON_ONCE(reserved)) return -EINVAL; - if (is_mmio_spte(spte)) { + if (is_mmio_spte(vcpu->kvm, spte)) { gfn_t gfn = get_mmio_spte_gfn(spte); unsigned int access = get_mmio_spte_access(spte); @@ -4206,24 +4217,28 @@ static u32 alloc_apf_token(struct kvm_vcpu *vcpu) return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; } -static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - gfn_t gfn) +static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) { struct kvm_arch_async_pf arch; arch.token = alloc_apf_token(vcpu); - arch.gfn = gfn; + arch.gfn = fault->gfn; + arch.error_code = fault->error_code; arch.direct_map = vcpu->arch.mmu->root_role.direct; arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu); - return kvm_setup_async_pf(vcpu, cr2_or_gpa, - kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); + return kvm_setup_async_pf(vcpu, fault->addr, + kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch); } void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { int r; + if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS)) + return; + if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) || work->wakeup_all) return; @@ -4236,7 +4251,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu)) return; - kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL); + kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, true, NULL); } static inline u8 kvm_max_level_for_order(int order) @@ -4256,14 +4271,6 @@ static inline u8 kvm_max_level_for_order(int order) return PG_LEVEL_4K; } -static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, - struct kvm_page_fault *fault) -{ - kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT, - PAGE_SIZE, fault->write, fault->exec, - fault->is_private); -} - static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { @@ -4290,48 +4297,15 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - struct kvm_memory_slot *slot = fault->slot; bool async; - /* - * Retry the page fault if the gfn hit a memslot that is being deleted - * or moved. This ensures any existing SPTEs for the old memslot will - * be zapped before KVM inserts a new MMIO SPTE for the gfn. - */ - if (slot && (slot->flags & KVM_MEMSLOT_INVALID)) - return RET_PF_RETRY; - - if (!kvm_is_visible_memslot(slot)) { - /* Don't expose private memslots to L2. */ - if (is_guest_mode(vcpu)) { - fault->slot = NULL; - fault->pfn = KVM_PFN_NOSLOT; - fault->map_writable = false; - return RET_PF_CONTINUE; - } - /* - * If the APIC access page exists but is disabled, go directly - * to emulation without caching the MMIO access or creating a - * MMIO SPTE. That way the cache doesn't need to be purged - * when the AVIC is re-enabled. - */ - if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT && - !kvm_apicv_activated(vcpu->kvm)) - return RET_PF_EMULATE; - } - - if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) { - kvm_mmu_prepare_memory_fault_exit(vcpu, fault); - return -EFAULT; - } - if (fault->is_private) return kvm_faultin_pfn_private(vcpu, fault); async = false; - fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async, - fault->write, &fault->map_writable, - &fault->hva); + fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, false, + &async, fault->write, + &fault->map_writable, &fault->hva); if (!async) return RET_PF_CONTINUE; /* *pfn has correct page already */ @@ -4341,7 +4315,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn); kvm_make_request(KVM_REQ_APF_HALT, vcpu); return RET_PF_RETRY; - } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) { + } else if (kvm_arch_setup_async_pf(vcpu, fault)) { return RET_PF_RETRY; } } @@ -4351,17 +4325,72 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * to wait for IO. Note, gup always bails if it is unable to quickly * get a page and a fatal signal, i.e. SIGKILL, is pending. */ - fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL, - fault->write, &fault->map_writable, - &fault->hva); + fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, true, + NULL, fault->write, + &fault->map_writable, &fault->hva); return RET_PF_CONTINUE; } static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, unsigned int access) { + struct kvm_memory_slot *slot = fault->slot; int ret; + /* + * Note that the mmu_invalidate_seq also serves to detect a concurrent + * change in attributes. is_page_fault_stale() will detect an + * invalidation relate to fault->fn and resume the guest without + * installing a mapping in the page tables. + */ + fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; + smp_rmb(); + + /* + * Now that we have a snapshot of mmu_invalidate_seq we can check for a + * private vs. shared mismatch. + */ + if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) { + kvm_mmu_prepare_memory_fault_exit(vcpu, fault); + return -EFAULT; + } + + if (unlikely(!slot)) + return kvm_handle_noslot_fault(vcpu, fault, access); + + /* + * Retry the page fault if the gfn hit a memslot that is being deleted + * or moved. This ensures any existing SPTEs for the old memslot will + * be zapped before KVM inserts a new MMIO SPTE for the gfn. + */ + if (slot->flags & KVM_MEMSLOT_INVALID) + return RET_PF_RETRY; + + if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) { + /* + * Don't map L1's APIC access page into L2, KVM doesn't support + * using APICv/AVIC to accelerate L2 accesses to L1's APIC, + * i.e. the access needs to be emulated. Emulating access to + * L1's APIC is also correct if L1 is accelerating L2's own + * virtual APIC, but for some reason L1 also maps _L1's_ APIC + * into L2. Note, vcpu_is_mmio_gpa() always treats access to + * the APIC as MMIO. Allow an MMIO SPTE to be created, as KVM + * uses different roots for L1 vs. L2, i.e. there is no danger + * of breaking APICv/AVIC for L1. + */ + if (is_guest_mode(vcpu)) + return kvm_handle_noslot_fault(vcpu, fault, access); + + /* + * If the APIC access page exists but is disabled, go directly + * to emulation without caching the MMIO access or creating a + * MMIO SPTE. That way the cache doesn't need to be purged + * when the AVIC is re-enabled. + */ + if (!kvm_apicv_activated(vcpu->kvm)) + return RET_PF_EMULATE; + } + fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; smp_rmb(); @@ -4386,8 +4415,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held * to detect retry guarantees the worst case latency for the vCPU. */ - if (fault->slot && - mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) + if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) return RET_PF_RETRY; ret = __kvm_faultin_pfn(vcpu, fault); @@ -4397,7 +4425,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (unlikely(is_error_pfn(fault->pfn))) return kvm_handle_error_pfn(vcpu, fault); - if (unlikely(!fault->slot)) + if (WARN_ON_ONCE(!fault->slot || is_noslot_pfn(fault->pfn))) return kvm_handle_noslot_fault(vcpu, fault, access); /* @@ -4508,6 +4536,16 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, if (WARN_ON_ONCE(fault_address >> 32)) return -EFAULT; #endif + /* + * Legacy #PF exception only have a 32-bit error code. Simply drop the + * upper bits as KVM doesn't use them for #PF (because they are never + * set), and to ensure there are no collisions with KVM-defined bits. + */ + if (WARN_ON_ONCE(error_code >> 32)) + error_code = lower_32_bits(error_code); + + /* Ensure the above sanity check also covers KVM-defined flags. */ + BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK)); vcpu->arch.l1tf_flush_l1d = true; if (!flags) { @@ -4759,7 +4797,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, unsigned int access) { - if (unlikely(is_mmio_spte(*sptep))) { + if (unlikely(is_mmio_spte(vcpu->kvm, *sptep))) { if (gfn != get_mmio_spte_gfn(*sptep)) { mmu_spte_clear_no_track(sptep); return true; @@ -5793,30 +5831,35 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err int r, emulation_type = EMULTYPE_PF; bool direct = vcpu->arch.mmu->root_role.direct; - /* - * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP - * checks when emulating instructions that triggers implicit access. - * WARN if hardware generates a fault with an error code that collides - * with the KVM-defined value. Clear the flag and continue on, i.e. - * don't terminate the VM, as KVM can't possibly be relying on a flag - * that KVM doesn't know about. - */ - if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS)) - error_code &= ~PFERR_IMPLICIT_ACCESS; - if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) return RET_PF_RETRY; + /* + * Except for reserved faults (emulated MMIO is shared-only), set the + * PFERR_PRIVATE_ACCESS flag for software-protected VMs based on the gfn's + * current attributes, which are the source of truth for such VMs. Note, + * this wrong for nested MMUs as the GPA is an L2 GPA, but KVM doesn't + * currently supported nested virtualization (among many other things) + * for software-protected VMs. + */ + if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && + !(error_code & PFERR_RSVD_MASK) && + vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM && + kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(cr2_or_gpa))) + error_code |= PFERR_PRIVATE_ACCESS; + r = RET_PF_INVALID; if (unlikely(error_code & PFERR_RSVD_MASK)) { + if (WARN_ON_ONCE(error_code & PFERR_PRIVATE_ACCESS)) + return -EFAULT; + r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct); if (r == RET_PF_EMULATE) goto emulate; } if (r == RET_PF_INVALID) { - r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, - lower_32_bits(error_code), false, + r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false, &emulation_type); if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) return -EIO; @@ -6120,7 +6163,10 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache; vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO; - vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO; + vcpu->arch.mmu_shadow_page_cache.init_value = + SHADOW_NONPRESENT_VALUE; + if (!vcpu->arch.mmu_shadow_page_cache.init_value) + vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO; vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; @@ -6263,6 +6309,7 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) void kvm_mmu_init_vm(struct kvm *kvm) { + kvm->arch.shadow_mmio_value = shadow_mmio_value; INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 5390a591a571..ce2fcd19ba6b 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -190,7 +190,7 @@ static inline bool is_nx_huge_page_enabled(struct kvm *kvm) struct kvm_page_fault { /* arguments to kvm_mmu_do_page_fault. */ const gpa_t addr; - const u32 error_code; + const u64 error_code; const bool prefetch; /* Derived from error_code. */ @@ -279,8 +279,16 @@ enum { RET_PF_SPURIOUS, }; +static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) +{ + kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT, + PAGE_SIZE, fault->write, fault->exec, + fault->is_private); +} + static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u32 err, bool prefetch, int *emulation_type) + u64 err, bool prefetch, int *emulation_type) { struct kvm_page_fault fault = { .addr = cr2_or_gpa, @@ -298,7 +306,10 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, .max_level = KVM_MAX_HUGEPAGE_LEVEL, .req_level = PG_LEVEL_4K, .goal_level = PG_LEVEL_4K, - .is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT), + .is_private = err & PFERR_PRIVATE_ACCESS, + + .pfn = KVM_PFN_ERR_FAULT, + .hva = KVM_HVA_ERR_BAD, }; int r; @@ -320,6 +331,17 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, else r = vcpu->arch.mmu->page_fault(vcpu, &fault); + /* + * Not sure what's happening, but punt to userspace and hope that + * they can fix it by changing memory to shared, or they can + * provide a better error. + */ + if (r == RET_PF_EMULATE && fault.is_private) { + pr_warn_ratelimited("kvm: unexpected emulation request on private memory\n"); + kvm_mmu_prepare_memory_fault_exit(vcpu, &fault); + return -EFAULT; + } + if (fault.write_fault_to_shadow_pgtable && emulation_type) *emulation_type |= EMULTYPE_WRITE_PF_TO_SP; diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index ae86820cef69..195d98bc8de8 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -260,7 +260,7 @@ TRACE_EVENT( TP_STRUCT__entry( __field(int, vcpu_id) __field(gpa_t, cr2_or_gpa) - __field(u32, error_code) + __field(u64, error_code) __field(u64 *, sptep) __field(u64, old_spte) __field(u64, new_spte) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 4d4e98fe4f35..9aac3aa93d88 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -911,7 +911,7 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int gpa_t pte_gpa; gfn_t gfn; - if (WARN_ON_ONCE(!sp->spt[i])) + if (WARN_ON_ONCE(sp->spt[i] == SHADOW_NONPRESENT_VALUE)) return 0; first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); @@ -933,13 +933,13 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int return 0; /* - * Drop the SPTE if the new protections would result in a RWX=0 - * SPTE or if the gfn is changing. The RWX=0 case only affects - * EPT with execute-only support, i.e. EPT without an effective - * "present" bit, as all other paging modes will create a - * read-only SPTE if pte_access is zero. + * Drop the SPTE if the new protections result in no effective + * "present" bit or if the gfn is changing. The former case + * only affects EPT with execute-only support with pte_access==0; + * all other paging modes will create a read-only SPTE if + * pte_access is zero. */ - if ((!pte_access && !shadow_present_mask) || + if ((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE || gfn != kvm_mmu_page_get_gfn(sp, i)) { drop_spte(vcpu->kvm, &sp->spt[i]); return 1; diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 6c7ab3aa6aa7..a5e014d7bc62 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -74,10 +74,10 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) u64 spte = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; - WARN_ON_ONCE(!shadow_mmio_value); + WARN_ON_ONCE(!vcpu->kvm->arch.shadow_mmio_value); access &= shadow_mmio_access_mask; - spte |= shadow_mmio_value | access; + spte |= vcpu->kvm->arch.shadow_mmio_value | access; spte |= gpa | shadow_nonpresent_or_rsvd_mask; spte |= (gpa & shadow_nonpresent_or_rsvd_mask) << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN; @@ -144,19 +144,19 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 spte = SPTE_MMU_PRESENT_MASK; bool wrprot = false; - WARN_ON_ONCE(!pte_access && !shadow_present_mask); + /* + * For the EPT case, shadow_present_mask has no RWX bits set if + * exec-only page table entries are supported. In that case, + * ACC_USER_MASK and shadow_user_mask are used to represent + * read access. See FNAME(gpte_access) in paging_tmpl.h. + */ + WARN_ON_ONCE((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE); if (sp->role.ad_disabled) spte |= SPTE_TDP_AD_DISABLED; else if (kvm_mmu_page_ad_need_write_protect(sp)) spte |= SPTE_TDP_AD_WRPROT_ONLY; - /* - * For the EPT case, shadow_present_mask is 0 if hardware - * supports exec-only page table entries. In that case, - * ACC_USER_MASK and shadow_user_mask are used to represent - * read access. See FNAME(gpte_access) in paging_tmpl.h. - */ spte |= shadow_present_mask; if (!prefetch) spte |= spte_shadow_accessed_mask(spte); @@ -413,7 +413,9 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull; shadow_nx_mask = 0ull; shadow_x_mask = VMX_EPT_EXECUTABLE_MASK; - shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK; + /* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */ + shadow_present_mask = + (has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT; /* * EPT overrides the host MTRRs, and so KVM must program the desired * memtype directly into the SPTEs. Note, this mask is just the mask @@ -430,7 +432,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) * of an EPT paging-structure entry is 110b (write/execute). */ kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, - VMX_EPT_RWX_MASK, 0); + VMX_EPT_RWX_MASK | VMX_EPT_SUPPRESS_VE_BIT, 0); } EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks); diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index f5c600c52f83..5dd5405fa07a 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -149,6 +149,22 @@ static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11); #define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0) +/* + * Non-present SPTE value needs to set bit 63 for TDX, in order to suppress + * #VE and get EPT violations on non-present PTEs. We can use the + * same value also without TDX for both VMX and SVM: + * + * For SVM NPT, for non-present spte (bit 0 = 0), other bits are ignored. + * For VMX EPT, bit 63 is ignored if #VE is disabled. (EPT_VIOLATION_VE=0) + * bit 63 is #VE suppress if #VE is enabled. (EPT_VIOLATION_VE=1) + */ +#ifdef CONFIG_X86_64 +#define SHADOW_NONPRESENT_VALUE BIT_ULL(63) +static_assert(!(SHADOW_NONPRESENT_VALUE & SPTE_MMU_PRESENT_MASK)); +#else +#define SHADOW_NONPRESENT_VALUE 0ULL +#endif + extern u64 __read_mostly shadow_host_writable_mask; extern u64 __read_mostly shadow_mmu_writable_mask; extern u64 __read_mostly shadow_nx_mask; @@ -190,11 +206,11 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; * * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on * both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF - * vulnerability. Use only low bits to avoid 64-bit immediates. + * vulnerability. * * Only used by the TDP MMU. */ -#define REMOVED_SPTE 0x5a0ULL +#define REMOVED_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL) /* Removed SPTEs must not be misconstrued as shadow present PTEs. */ static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK)); @@ -249,9 +265,9 @@ static inline struct kvm_mmu_page *root_to_sp(hpa_t root) return spte_to_child_sp(root); } -static inline bool is_mmio_spte(u64 spte) +static inline bool is_mmio_spte(struct kvm *kvm, u64 spte) { - return (spte & shadow_mmio_mask) == shadow_mmio_value && + return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value && likely(enable_mmio_caching); } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index c6192a52bd31..5fd618abc243 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -495,8 +495,8 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, * impact the guest since both the former and current SPTEs * are nonpresent. */ - if (WARN_ON_ONCE(!is_mmio_spte(old_spte) && - !is_mmio_spte(new_spte) && + if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) && + !is_mmio_spte(kvm, new_spte) && !is_removed_spte(new_spte))) pr_err("Unexpected SPTE change! Nonpresent SPTEs\n" "should not be replaced with another,\n" @@ -603,7 +603,7 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, * here since the SPTE is going from non-present to non-present. Use * the raw write helper to avoid an unnecessary check on volatile bits. */ - __kvm_tdp_mmu_write_spte(iter->sptep, 0); + __kvm_tdp_mmu_write_spte(iter->sptep, SHADOW_NONPRESENT_VALUE); return 0; } @@ -740,8 +740,8 @@ retry: continue; if (!shared) - tdp_mmu_iter_set_spte(kvm, &iter, 0); - else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0)) + tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE); + else if (tdp_mmu_set_spte_atomic(kvm, & |