diff options
| -rw-r--r-- | arch/x86/include/asm/kvm_host.h | 16 | ||||
| -rw-r--r-- | arch/x86/include/asm/kvm_page_track.h | 67 | ||||
| -rw-r--r-- | arch/x86/kvm/Kconfig | 13 | ||||
| -rw-r--r-- | arch/x86/kvm/mmu.h | 2 | ||||
| -rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 319 | ||||
| -rw-r--r-- | arch/x86/kvm/mmu/mmu_internal.h | 24 | ||||
| -rw-r--r-- | arch/x86/kvm/mmu/page_track.c | 252 | ||||
| -rw-r--r-- | arch/x86/kvm/mmu/page_track.h | 58 | ||||
| -rw-r--r-- | arch/x86/kvm/mmu/paging_tmpl.h | 41 | ||||
| -rw-r--r-- | arch/x86/kvm/mmu/spte.c | 6 | ||||
| -rw-r--r-- | arch/x86/kvm/mmu/spte.h | 21 | ||||
| -rw-r--r-- | arch/x86/kvm/mmu/tdp_iter.c | 11 | ||||
| -rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 33 | ||||
| -rw-r--r-- | arch/x86/kvm/x86.c | 22 | ||||
| -rw-r--r-- | drivers/gpu/drm/i915/gvt/gtt.c | 102 | ||||
| -rw-r--r-- | drivers/gpu/drm/i915/gvt/gtt.h | 1 | ||||
| -rw-r--r-- | drivers/gpu/drm/i915/gvt/gvt.h | 3 | ||||
| -rw-r--r-- | drivers/gpu/drm/i915/gvt/kvmgt.c | 120 | ||||
| -rw-r--r-- | drivers/gpu/drm/i915/gvt/page_track.c | 10 | ||||
| -rw-r--r-- | include/linux/kvm_host.h | 19 |
20 files changed, 562 insertions, 578 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e3c9ff4146fc..1a4def36d5bb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -288,13 +288,13 @@ struct kvm_kernel_irq_routing_entry; * kvm_mmu_page_role tracks the properties of a shadow page (where shadow page * also includes TDP pages) to determine whether or not a page can be used in * the given MMU context. This is a subset of the overall kvm_cpu_role to - * minimize the size of kvm_memory_slot.arch.gfn_track, i.e. allows allocating - * 2 bytes per gfn instead of 4 bytes per gfn. + * minimize the size of kvm_memory_slot.arch.gfn_write_track, i.e. allows + * allocating 2 bytes per gfn instead of 4 bytes per gfn. * * Upper-level shadow pages having gptes are tracked for write-protection via - * gfn_track. As above, gfn_track is a 16 bit counter, so KVM must not create - * more than 2^16-1 upper-level shadow pages at a single gfn, otherwise - * gfn_track will overflow and explosions will ensure. + * gfn_write_track. As above, gfn_write_track is a 16 bit counter, so KVM must + * not create more than 2^16-1 upper-level shadow pages at a single gfn, + * otherwise gfn_write_track will overflow and explosions will ensue. * * A unique shadow page (SP) for a gfn is created if and only if an existing SP * cannot be reused. The ability to reuse a SP is tracked by its role, which @@ -1023,7 +1023,7 @@ struct kvm_lpage_info { struct kvm_arch_memory_slot { struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES]; struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; - unsigned short *gfn_track[KVM_PAGE_TRACK_MAX]; + unsigned short *gfn_write_track; }; /* @@ -1265,8 +1265,9 @@ struct kvm_arch { * create an NX huge page (without hanging the guest). */ struct list_head possible_nx_huge_pages; - struct kvm_page_track_notifier_node mmu_sp_tracker; +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING struct kvm_page_track_notifier_head track_notifier_head; +#endif /* * Protects marking pages unsync during page faults, as TDP MMU page * faults only take mmu_lock for read. For simplicity, the unsync @@ -1853,7 +1854,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot); -void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages); diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index eb186bc57f6a..3d040741044b 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -2,11 +2,9 @@ #ifndef _ASM_X86_KVM_PAGE_TRACK_H #define _ASM_X86_KVM_PAGE_TRACK_H -enum kvm_page_track_mode { - KVM_PAGE_TRACK_WRITE, - KVM_PAGE_TRACK_MAX, -}; +#include <linux/kvm_types.h> +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING /* * The notifier represented by @kvm_page_track_notifier_node is linked into * the head which will be notified when guest is triggering the track event. @@ -26,54 +24,39 @@ struct kvm_page_track_notifier_node { * It is called when guest is writing the write-tracked page * and write emulation is finished at that time. * - * @vcpu: the vcpu where the write access happened. * @gpa: the physical address written by guest. * @new: the data was written to the address. * @bytes: the written length. * @node: this node */ - void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, - int bytes, struct kvm_page_track_notifier_node *node); + void (*track_write)(gpa_t gpa, const u8 *new, int bytes, + struct kvm_page_track_notifier_node *node); + /* - * It is called when memory slot is being moved or removed - * users can drop write-protection for the pages in that memory slot + * Invoked when a memory region is removed from the guest. Or in KVM + * terms, when a memslot is deleted. * - * @kvm: the kvm where memory slot being moved or removed - * @slot: the memory slot being moved or removed - * @node: this node + * @gfn: base gfn of the region being removed + * @nr_pages: number of pages in the to-be-removed region + * @node: this node */ - void (*track_flush_slot)(struct kvm *kvm, struct kvm_memory_slot *slot, - struct kvm_page_track_notifier_node *node); + void (*track_remove_region)(gfn_t gfn, unsigned long nr_pages, + struct kvm_page_track_notifier_node *node); }; -int kvm_page_track_init(struct kvm *kvm); -void kvm_page_track_cleanup(struct kvm *kvm); +int kvm_page_track_register_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n); +void kvm_page_track_unregister_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n); -bool kvm_page_track_write_tracking_enabled(struct kvm *kvm); -int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot); - -void kvm_page_track_free_memslot(struct kvm_memory_slot *slot); -int kvm_page_track_create_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot, - unsigned long npages); - -void kvm_slot_page_track_add_page(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn, - enum kvm_page_track_mode mode); -void kvm_slot_page_track_remove_page(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn, - enum kvm_page_track_mode mode); -bool kvm_slot_page_track_is_active(struct kvm *kvm, - const struct kvm_memory_slot *slot, - gfn_t gfn, enum kvm_page_track_mode mode); +int kvm_write_track_add_gfn(struct kvm *kvm, gfn_t gfn); +int kvm_write_track_remove_gfn(struct kvm *kvm, gfn_t gfn); +#else +/* + * Allow defining a node in a structure even if page tracking is disabled, e.g. + * to play nice with testing headers via direct inclusion from the command line. + */ +struct kvm_page_track_notifier_node {}; +#endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */ -void -kvm_page_track_register_notifier(struct kvm *kvm, - struct kvm_page_track_notifier_node *n); -void -kvm_page_track_unregister_notifier(struct kvm *kvm, - struct kvm_page_track_notifier_node *n); -void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, - int bytes); -void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot); #endif diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 66dbd1f4d57d..ed90f148140d 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -138,6 +138,19 @@ config KVM_XEN If in doubt, say "N". +config KVM_PROVE_MMU + bool "Prove KVM MMU correctness" + depends on DEBUG_KERNEL + depends on KVM + depends on EXPERT + help + Enables runtime assertions in KVM's MMU that are too costly to enable + in anything remotely resembling a production environment, e.g. this + gates code that verifies a to-be-freed page table doesn't have any + present SPTEs. + + If in doubt, say "N". + config KVM_EXTERNAL_WRITE_TRACKING bool diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 92d5a1924fc1..253fb2093d5d 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -121,6 +121,8 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu); +void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, + int bytes); static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 276157f8496c..e1d011c67cc6 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -25,6 +25,7 @@ #include "kvm_cache_regs.h" #include "smm.h" #include "kvm_emulate.h" +#include "page_track.h" #include "cpuid.h" #include "spte.h" @@ -53,7 +54,7 @@ #include <asm/io.h> #include <asm/set_memory.h> #include <asm/vmx.h> -#include <asm/kvm_page_track.h> + #include "trace.h" extern bool itlb_multihit_kvm_mitigation; @@ -115,11 +116,6 @@ static int max_huge_page_level __read_mostly; static int tdp_root_level __read_mostly; static int max_tdp_level __read_mostly; -#ifdef MMU_DEBUG -bool dbg = 0; -module_param(dbg, bool, 0644); -#endif - #define PTE_PREFETCH_NUM 8 #include <trace/events/kvm.h> @@ -486,7 +482,7 @@ retry: */ static void mmu_spte_set(u64 *sptep, u64 new_spte) { - WARN_ON(is_shadow_present_pte(*sptep)); + WARN_ON_ONCE(is_shadow_present_pte(*sptep)); __set_spte(sptep, new_spte); } @@ -498,7 +494,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) { u64 old_spte = *sptep; - WARN_ON(!is_shadow_present_pte(new_spte)); + WARN_ON_ONCE(!is_shadow_present_pte(new_spte)); check_spte_writable_invariants(new_spte); if (!is_shadow_present_pte(old_spte)) { @@ -511,7 +507,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) else old_spte = __update_clear_spte_slow(sptep, new_spte); - WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); + WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); return old_spte; } @@ -593,7 +589,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) * by a refcounted page, the refcount is elevated. */ page = kvm_pfn_to_refcounted_page(pfn); - WARN_ON(page && !page_count(page)); + WARN_ON_ONCE(page && !page_count(page)); if (is_accessed_spte(old_spte)) kvm_set_pfn_accessed(pfn); @@ -808,7 +804,7 @@ static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot, for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); linfo->disallow_lpage += count; - WARN_ON(linfo->disallow_lpage < 0); + WARN_ON_ONCE(linfo->disallow_lpage < 0); } } @@ -835,8 +831,7 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) /* the non-leaf shadow pages are keeping readonly. */ if (sp->role.level > PG_LEVEL_4K) - return kvm_slot_page_track_add_page(kvm, slot, gfn, - KVM_PAGE_TRACK_WRITE); + return __kvm_write_track_add_gfn(kvm, slot, gfn); kvm_mmu_gfn_disallow_lpage(slot, gfn); @@ -882,8 +877,7 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); if (sp->role.level > PG_LEVEL_4K) - return kvm_slot_page_track_remove_page(kvm, slot, gfn, - KVM_PAGE_TRACK_WRITE); + return __kvm_write_track_remove_gfn(kvm, slot, gfn); kvm_mmu_gfn_allow_lpage(slot, gfn); } @@ -937,10 +931,8 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, int count = 0; if (!rmap_head->val) { - rmap_printk("%p %llx 0->1\n", spte, *spte); rmap_head->val = (unsigned long)spte; } else if (!(rmap_head->val & 1)) { - rmap_printk("%p %llx 1->many\n", spte, *spte); desc = kvm_mmu_memory_cache_alloc(cache); desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; @@ -949,7 +941,6 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, rmap_head->val = (unsigned long)desc | 1; ++count; } else { - rmap_printk("%p %llx many->many\n", spte, *spte); desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); count = desc->tail_count + desc->spte_count; @@ -969,7 +960,8 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, return count; } -static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, +static void pte_list_desc_remove_entry(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, struct pte_list_desc *desc, int i) { struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); @@ -980,7 +972,7 @@ static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, * when adding an entry and the previous head is full, and heads are * removed (this flow) when they become empty. */ - BUG_ON(j < 0); + KVM_BUG_ON_DATA_CORRUPTION(j < 0, kvm); /* * Replace the to-be-freed SPTE with the last valid entry from the head @@ -1005,35 +997,34 @@ static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, mmu_free_pte_list_desc(head_desc); } -static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) +static void pte_list_remove(struct kvm *kvm, u64 *spte, + struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; int i; - if (!rmap_head->val) { - pr_err("%s: %p 0->BUG\n", __func__, spte); - BUG(); - } else if (!(rmap_head->val & 1)) { - rmap_printk("%p 1->0\n", spte); - if ((u64 *)rmap_head->val != spte) { - pr_err("%s: %p 1->BUG\n", __func__, spte); - BUG(); - } + if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm)) + return; + + if (!(rmap_head->val & 1)) { + if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm)) + return; + rmap_head->val = 0; } else { - rmap_printk("%p many->many\n", spte); desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); while (desc) { for (i = 0; i < desc->spte_count; ++i) { if (desc->sptes[i] == spte) { - pte_list_desc_remove_entry(rmap_head, desc, i); + pte_list_desc_remove_entry(kvm, rmap_head, + desc, i); return; } } desc = desc->more; } - pr_err("%s: %p many->many\n", __func__, spte); - BUG(); + + KVM_BUG_ON_DATA_CORRUPTION(true, kvm); } } @@ -1041,7 +1032,7 @@ static void kvm_zap_one_rmap_spte(struct kvm *kvm, struct kvm_rmap_head *rmap_head, u64 *sptep) { mmu_spte_clear_track_bits(kvm, sptep); - pte_list_remove(sptep, rmap_head); + pte_list_remove(kvm, sptep, rmap_head); } /* Return true if at least one SPTE was zapped, false otherwise */ @@ -1116,7 +1107,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) slot = __gfn_to_memslot(slots, gfn); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); - pte_list_remove(spte, rmap_head); + pte_list_remove(kvm, spte, rmap_head); } /* @@ -1208,7 +1199,7 @@ static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush) struct kvm_mmu_page *sp; sp = sptep_to_sp(sptep); - WARN_ON(sp->role.level == PG_LEVEL_4K); + WARN_ON_ONCE(sp->role.level == PG_LEVEL_4K); drop_spte(kvm, sptep); @@ -1237,8 +1228,6 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) !(pt_protect && is_mmu_writable_spte(spte))) return false; - rmap_printk("spte %p %llx\n", sptep, *sptep); - if (pt_protect) spte &= ~shadow_mmu_writable_mask; spte = spte & ~PT_WRITABLE_MASK; @@ -1263,9 +1252,7 @@ static bool spte_clear_dirty(u64 *sptep) { u64 spte = *sptep; - rmap_printk("spte %p %llx\n", sptep, *sptep); - - MMU_WARN_ON(!spte_ad_enabled(spte)); + KVM_MMU_WARN_ON(!spte_ad_enabled(spte)); spte &= ~shadow_dirty_mask; return mmu_spte_update(sptep, spte); } @@ -1471,14 +1458,11 @@ static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, u64 new_spte; kvm_pfn_t new_pfn; - WARN_ON(pte_huge(pte)); + WARN_ON_ONCE(pte_huge(pte)); new_pfn = pte_pfn(pte); restart: for_each_rmap_spte(rmap_head, &iter, sptep) { - rmap_printk("spte %p %llx gfn %llx (%d)\n", - sptep, *sptep, gfn, level); - need_flush = true; if (pte_write(pte)) { @@ -1706,21 +1690,19 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) return young; } -#ifdef MMU_DEBUG -static int is_empty_shadow_page(u64 *spt) +static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp) { - u64 *pos; - u64 *end; +#ifdef CONFIG_KVM_PROVE_MMU + int i; - for (pos = spt, end = pos + SPTE_ENT_PER_PAGE; pos != end; pos++) - if (is_shadow_present_pte(*pos)) { - printk(KERN_ERR "%s: %p %llx\n", __func__, - pos, *pos); - return 0; - } - return 1; -} + for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { + if (KVM_MMU_WARN_ON(is_shadow_present_pte(sp->spt[i]))) + pr_err_ratelimited("SPTE %llx (@ %p) for gfn %llx shadow-present at free", + sp->spt[i], &sp->spt[i], + kvm_mmu_page_get_gfn(sp, i)); + } #endif +} /* * This value is the sum of all of the kvm instances's @@ -1748,7 +1730,8 @@ static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp) { - MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); + kvm_mmu_check_sptes_at_free(sp); + hlist_del(&sp->hash_link); list_del(&sp->link); free_page((unsigned long)sp->spt); @@ -1771,16 +1754,16 @@ static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache, pte_list_add(cache, parent_pte, &sp->parent_ptes); } -static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, +static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *parent_pte) { - pte_list_remove(parent_pte, &sp->parent_ptes); + pte_list_remove(kvm, parent_pte, &sp->parent_ptes); } -static void drop_parent_pte(struct kvm_mmu_page *sp, +static void drop_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *parent_pte) { - mmu_page_remove_parent_pte(sp, parent_pte); + mmu_page_remove_parent_pte(kvm, sp, parent_pte); mmu_spte_clear_no_track(parent_pte); } @@ -1836,7 +1819,7 @@ static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx) { --sp->unsync_children; - WARN_ON((int)sp->unsync_children < 0); + WARN_ON_ONCE((int)sp->unsync_children < 0); __clear_bit(idx, sp->unsync_child_bitmap); } @@ -1894,7 +1877,7 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - WARN_ON(!sp->unsync); + WARN_ON_ONCE(!sp->unsync); trace_kvm_mmu_sync_page(sp); sp->unsync = 0; --kvm->stat.mmu_unsync; @@ -2069,11 +2052,11 @@ static int mmu_pages_first(struct kvm_mmu_pages *pvec, if (pvec->nr == 0) return 0; - WARN_ON(pvec->page[0].idx != INVALID_INDEX); + WARN_ON_ONCE(pvec->page[0].idx != INVALID_INDEX); sp = pvec->page[0].sp; level = sp->role.level; - WARN_ON(level == PG_LEVEL_4K); + WARN_ON_ONCE(level == PG_LEVEL_4K); parents->parent[level-2] = sp; @@ -2095,7 +2078,7 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents) if (!sp) return; - WARN_ON(idx == INVALID_INDEX); + WARN_ON_ONCE(idx == INVALID_INDEX); clear_unsync_child_bit(sp, idx); level++; } while (!sp->unsync_children); @@ -2216,7 +2199,7 @@ static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm, if (ret < 0) break; - WARN_ON(!list_empty(&invalid_list)); + WARN_ON_ONCE(!list_empty(&invalid_list)); if (ret > 0) kvm_flush_remote_tlbs(kvm); } @@ -2495,7 +2478,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (child->role.access == direct_access) return; - drop_parent_pte(child, sptep); + drop_parent_pte(vcpu->kvm, child, sptep); kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep); } } @@ -2513,7 +2496,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, drop_spte(kvm, spte); } else { child = spte_to_child_sp(pte); - drop_parent_pte(child, spte); + drop_parent_pte(kvm, child, spte); /* * Recursively zap nested TDP SPs, parentless SPs are @@ -2544,13 +2527,13 @@ static int kvm_mmu_page_unlink_children(struct kvm *kvm, return zapped; } -static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp) +static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) { u64 *sptep; struct rmap_iterator iter; while ((sptep = rmap_get_first(&sp->parent_ptes, &iter))) - drop_parent_pte(sp, sptep); + drop_parent_pte(kvm, sp, sptep); } static int mmu_zap_unsync_children(struct kvm *kvm, @@ -2589,7 +2572,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, ++kvm->stat.mmu_shadow_zapped; *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list); - kvm_mmu_unlink_parents(sp); + kvm_mmu_unlink_parents(kvm, sp); /* Zapping children means active_mmu_pages has become unstable. */ list_unstable = *nr_zapped; @@ -2671,7 +2654,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, kvm_flush_remote_tlbs(kvm); list_for_each_entry_safe(sp, nsp, invalid_list, link) { - WARN_ON(!sp->role.invalid || sp->root_count); + WARN_ON_ONCE(!sp->role.invalid || sp->root_count); kvm_mmu_free_shadow_page(sp); } } @@ -2771,12 +2754,9 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) LIST_HEAD(invalid_list); int r; - pgprintk("%s: looking for gfn %llx\n", __func__, gfn); r = 0; write_lock(&kvm->mmu_lock); for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) { - pgprintk("%s: gfn %llx role %x\n", __func__, gfn, - sp->role.word); r = 1; kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); } @@ -2827,7 +2807,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, * track machinery is used to write-protect upper-level shadow pages, * i.e. this guards the role.level == 4K assertion below! */ - if (kvm_slot_page_track_is_active(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE)) + if (kvm_gfn_is_write_tracked(kvm, slot, gfn)) return -EPERM; /* @@ -2869,7 +2849,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, continue; } - WARN_ON(sp->role.level != PG_LEVEL_4K); + WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K); kvm_unsync_page(kvm, sp); } if (locked) @@ -2934,9 +2914,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, bool prefetch = !fault || fault->prefetch; bool write_fault = fault && fault->write; - pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, - *sptep, write_fault, gfn); - if (unlikely(is_noslot_pfn(pfn))) { vcpu->stat.pf_mmio_spte_created++; mark_mmio_spte(vcpu, sptep, gfn, pte_access); @@ -2953,11 +2930,9 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, u64 pte = *sptep; child = spte_to_child_sp(pte); - drop_parent_pte(child, sptep); + drop_parent_pte(vcpu->kvm, child, sptep); flush = true; } else if (pfn != spte_to_pfn(*sptep)) { - pgprintk("hfn old %llx new %llx\n", - spte_to_pfn(*sptep), pfn); drop_spte(vcpu->kvm, sptep); flush = true; } else @@ -2982,8 +2957,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, if (flush) kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level); - pgprintk("%s: setting spte %llx\n", __func__, *sptep); - if (!was_rmapped) { WARN_ON_ONCE(ret == RET_PF_SPURIOUS); rmap_add(vcpu, slot, sptep, gfn, pte_access); @@ -3029,7 +3002,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *spte, *start = NULL; int i; - WARN_ON(!sp->role.direct); + WARN_ON_ONCE(!sp->role.direct); i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1); spte = sp->spt + i; @@ -3570,12 +3543,8 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, if (!VALID_PAGE(*root_hpa)) return; - /* - * The "root" may be a special root, e.g. a PAE entry, treat it as a - * SPTE to ensure any non-PA bits are dropped. - */ - sp = spte_to_child_sp(*root_hpa); - if (WARN_ON(!sp)) + sp = root_to_sp(*root_hpa); + if (WARN_ON_ONCE(!sp)) return; if (is_tdp_mmu_page(sp)) @@ -3620,7 +3589,9 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, &invalid_list); if (free_active_root) { - if (to_shadow_page(mmu->root.hpa)) { + if (kvm_mmu_is_dummy_root(mmu->root.hpa)) { + /* Nothing to cleanup for dummy roots. */ + } else if (root_to_sp(mmu->root.hpa)) { mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list); } else if (mmu->pae_root) { for (i = 0; i < 4; ++i) { @@ -3644,6 +3615,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu) { unsigned long roots_to_free = 0; + struct kvm_mmu_page *sp; hpa_t root_hpa; int i; @@ -3658,8 +3630,8 @@ void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu) if (!VALID_PAGE(root_hpa)) continue; - if (!to_shadow_page(root_hpa) || - to_shadow_page(root_hpa)->role.guest_mode) + sp = root_to_sp(root_hpa); + if (!sp || sp->role.guest_mode) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); } @@ -3667,19 +3639,6 @@ void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu) } EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots); - -static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) -{ - int ret = 0; - - if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) { - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - ret = 1; - } - - return ret; -} - static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant, u8 level) { @@ -3817,8 +3776,10 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu); root_gfn = root_pgd >> PAGE_SHIFT; - if (mmu_check_root(vcpu, root_gfn)) - return 1; + if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) { + mmu->root.hpa = kvm_mmu_get_dummy_root(); + return 0; + } /* * On SVM, reading PDPTRs might access guest memory, which might fault @@ -3830,8 +3791,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) if (!(pdptrs[i] & PT_PRESENT_MASK)) continue; - if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT)) - return 1; + if (!kvm_vcpu_is_visible_gfn(vcpu, pdptrs[i] >> PAGE_SHIFT)) + pdptrs[i] = 0; } } @@ -3998,7 +3959,7 @@ static bool is_unsync_root(hpa_t root) { struct kvm_mmu_page *sp; - if (!VALID_PAGE(root)) + if (!VALID_PAGE(root) || kvm_mmu_is_dummy_root(root)) return false; /* @@ -4014,7 +3975,7 @@ static bool is_unsync_root(hpa_t root) * requirement isn't satisfied. */ smp_rmb(); - sp = to_shadow_page(root); + sp = root_to_sp(root); /* * PAE roots (somewhat arbitrarily) aren't backed by shadow pages, the @@ -4044,11 +4005,12 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu->root.hpa; - sp = to_shadow_page(root); if (!is_unsync_root(root)) return; + sp = root_to_sp(root); + write_lock(&vcpu->kvm->mmu_lock); mmu_sync_children(vcpu, sp, true); write_unlock(&vcpu->kvm->mmu_lock); @@ -4190,7 +4152,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) return RET_PF_EMULATE; reserved = get_mmio_spte(vcpu, addr, &spte); - if (WARN_ON(reserved)) + if (WARN_ON_ONCE(reserved)) return -EINVAL; if (is_mmio_spte(spte)) { @@ -4228,7 +4190,7 @@ static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, * guest is writing the page which is write tracked which can * not be fixed by page fault handler. */ - if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE)) + if (kvm_gfn_is_write_tracked(vcpu->kvm, fault->slot, fault->gfn)) return true; return false; @@ -4378,7 +4340,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, static bool is_page_fault_stale(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa); + struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa); /* Special roots, e.g. pae_root, are not backed by shadow pages. */ if (sp && is_obsolete_sp(vcpu->kvm, sp)) @@ -4403,6 +4365,10 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault { int r; + /* Dummy roots are used only for shadowing bad guest roots. */ + if (WARN_ON_ONCE(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) + return RET_PF_RETRY; + if (page_fault_handle_page_track(vcpu, fault)) return RET_PF_EMULATE; @@ -4439,8 +4405,6 @@ out_unlock: static int nonpaging_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code); - /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */ fault->max_level = PG_LEVEL_2M; return direct_page_fault(vcpu, fault); @@ -4558,9 +4522,19 @@ static void nonpaging_init_context(struct kvm_mmu *context) static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, union kvm_mmu_page_role role) { - return (role.direct || pgd == root->pgd) && - VALID_PAGE(root->hpa) && - role.word == to_shadow_page(root->hpa)->role.word; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(root->hpa)) + return false; + + if (!role.direct && pgd != root->pgd) + return false; + + sp = root_to_sp(root->hpa); + if (WARN_ON_ONCE(!sp)) + return false; + + return role.word == sp->role.word; } /* @@ -4630,11 +4604,10 @@ static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu, gpa_t new_pgd, union kvm_mmu_page_role new_role) { /* - * For now, limit the caching to 64-bit hosts+VMs in order to avoid - * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs - * later if necessary. + * Limit reuse to 64-bit hosts+VMs without "special" roots in order to + * avoid having to deal with PDPTEs and other complexities. */ - if (VALID_PAGE(mmu->root.hpa) && !to_shadow_page(mmu->root.hpa)) + if (VALID_PAGE(mmu->root.hpa) && !root_to_sp(mmu->root.hpa)) kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT); if (VALID_PAGE(mmu->root.hpa)) @@ -4680,9 +4653,12 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd) * If this is a direct root page, it doesn't have a write flooding * count. Otherwise, clear the write flooding count. */ - if (!new_role.direct) - __clear_sp_write_flooding_count( - to_shadow_page(vcpu->arch.mmu->root.hpa)); + if (!new_role.direct) { + struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa); + + if (!WARN_ON_ONCE(!sp)) + __clear_sp_write_flooding_count(sp); + } } EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); @@ -5449,8 +5425,8 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) * physical address properties) in a single VM would require tracking * all relevant CPUID information in kvm_mmu_page_role. That is very * undesirable as it would increase the memory requirements for - * gfn_track (see struct kvm_mmu_page_role comments). F |
