diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-01-16 16:15:14 +0200 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-01-16 16:15:14 +0200 |
| commit | 79e06c4c4950be2abd8ca5d2428a8c915aa62c24 (patch) | |
| tree | 0507ef82aa3c7766b7b19163a0351882b7d7c5b5 /virt | |
| parent | cb3f09f9afe5286c0aed7a1c5cc71495de166efb (diff) | |
| parent | c862dcd199759d4a45e65dab47b03e3e8a144e3a (diff) | |
| download | linux-79e06c4c4950be2abd8ca5d2428a8c915aa62c24.tar.gz linux-79e06c4c4950be2abd8ca5d2428a8c915aa62c24.tar.bz2 linux-79e06c4c4950be2abd8ca5d2428a8c915aa62c24.zip | |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Paolo Bonzini:
"RISCV:
- Use common KVM implementation of MMU memory caches
- SBI v0.2 support for Guest
- Initial KVM selftests support
- Fix to avoid spurious virtual interrupts after clearing hideleg CSR
- Update email address for Anup and Atish
ARM:
- Simplification of the 'vcpu first run' by integrating it into KVM's
'pid change' flow
- Refactoring of the FP and SVE state tracking, also leading to a
simpler state and less shared data between EL1 and EL2 in the nVHE
case
- Tidy up the header file usage for the nvhe hyp object
- New HYP unsharing mechanism, finally allowing pages to be unmapped
from the Stage-1 EL2 page-tables
- Various pKVM cleanups around refcounting and sharing
- A couple of vgic fixes for bugs that would trigger once the vcpu
xarray rework is merged, but not sooner
- Add minimal support for ARMv8.7's PMU extension
- Rework kvm_pgtable initialisation ahead of the NV work
- New selftest for IRQ injection
- Teach selftests about the lack of default IPA space and page sizes
- Expand sysreg selftest to deal with Pointer Authentication
- The usual bunch of cleanups and doc update
s390:
- fix sigp sense/start/stop/inconsistency
- cleanups
x86:
- Clean up some function prototypes more
- improved gfn_to_pfn_cache with proper invalidation, used by Xen
emulation
- add KVM_IRQ_ROUTING_XEN_EVTCHN and event channel delivery
- completely remove potential TOC/TOU races in nested SVM consistency
checks
- update some PMCs on emulated instructions
- Intel AMX support (joint work between Thomas and Intel)
- large MMU cleanups
- module parameter to disable PMU virtualization
- cleanup register cache
- first part of halt handling cleanups
- Hyper-V enlightened MSR bitmap support for nested hypervisors
Generic:
- clean up Makefiles
- introduce CONFIG_HAVE_KVM_DIRTY_RING
- optimize memslot lookup using a tree
- optimize vCPU array usage by converting to xarray"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (268 commits)
x86/fpu: Fix inline prefix warnings
selftest: kvm: Add amx selftest
selftest: kvm: Move struct kvm_x86_state to header
selftest: kvm: Reorder vcpu_load_state steps for AMX
kvm: x86: Disable interception for IA32_XFD on demand
x86/fpu: Provide fpu_sync_guest_vmexit_xfd_state()
kvm: selftests: Add support for KVM_CAP_XSAVE2
kvm: x86: Add support for getting/setting expanded xstate buffer
x86/fpu: Add uabi_size to guest_fpu
kvm: x86: Add CPUID support for Intel AMX
kvm: x86: Add XCR0 support for Intel AMX
kvm: x86: Disable RDMSR interception of IA32_XFD_ERR
kvm: x86: Emulate IA32_XFD_ERR for guest
kvm: x86: Intercept #NM for saving IA32_XFD_ERR
x86/fpu: Prepare xfd_err in struct fpu_guest
kvm: x86: Add emulation for IA32_XFD
x86/fpu: Provide fpu_update_guest_xfd() for IA32_XFD emulation
kvm: x86: Enable dynamic xfeatures at KVM_SET_CPUID2
x86/fpu: Provide fpu_enable_guest_xfd_features() for KVM
x86/fpu: Add guest support to xfd_enable_feature()
...
Diffstat (limited to 'virt')
| -rw-r--r-- | virt/kvm/Kconfig | 6 | ||||
| -rw-r--r-- | virt/kvm/Makefile.kvm | 14 | ||||
| -rw-r--r-- | virt/kvm/async_pf.c | 2 | ||||
| -rw-r--r-- | virt/kvm/dirty_ring.c | 11 | ||||
| -rw-r--r-- | virt/kvm/kvm_main.c | 1066 | ||||
| -rw-r--r-- | virt/kvm/kvm_mm.h | 44 | ||||
| -rw-r--r-- | virt/kvm/mmu_lock.h | 23 | ||||
| -rw-r--r-- | virt/kvm/pfncache.c | 337 |
8 files changed, 1028 insertions, 475 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 62b39149b8c8..f4834c20e4a6 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -4,6 +4,9 @@ config HAVE_KVM bool +config HAVE_KVM_PFNCACHE + bool + config HAVE_KVM_IRQCHIP bool @@ -13,6 +16,9 @@ config HAVE_KVM_IRQFD config HAVE_KVM_IRQ_ROUTING bool +config HAVE_KVM_DIRTY_RING + bool + config HAVE_KVM_EVENTFD bool select EVENTFD diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm new file mode 100644 index 000000000000..2c27d5d0c367 --- /dev/null +++ b/virt/kvm/Makefile.kvm @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for Kernel-based Virtual Machine module +# + +KVM ?= ../../../virt/kvm + +kvm-y := $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/binary_stats.o +kvm-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o +kvm-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o +kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o +kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o +kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o +kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index dd777688d14a..9bfe1d6f6529 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -85,7 +85,7 @@ static void async_pf_execute(struct work_struct *work) trace_kvm_async_pf_completed(addr, cr2_or_gpa); - rcuwait_wake_up(&vcpu->wait); + __kvm_vcpu_wake_up(vcpu); mmput(mm); kvm_put_kvm(vcpu->kvm); diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c index 88f4683198ea..222ecc81d7df 100644 --- a/virt/kvm/dirty_ring.c +++ b/virt/kvm/dirty_ring.c @@ -9,7 +9,7 @@ #include <linux/vmalloc.h> #include <linux/kvm_dirty_ring.h> #include <trace/events/kvm.h> -#include "mmu_lock.h" +#include "kvm_mm.h" int __weak kvm_cpu_dirty_log_size(void) { @@ -36,15 +36,6 @@ static bool kvm_dirty_ring_full(struct kvm_dirty_ring *ring) return kvm_dirty_ring_used(ring) >= ring->size; } -struct kvm_dirty_ring *kvm_dirty_ring_get(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); - - WARN_ON_ONCE(vcpu->kvm != kvm); - - return &vcpu->dirty_ring; -} - static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask) { struct kvm_memory_slot *memslot; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 938975ff75a0..504158f0e131 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -59,7 +59,7 @@ #include "coalesced_mmio.h" #include "async_pf.h" -#include "mmu_lock.h" +#include "kvm_mm.h" #include "vfio.h" #define CREATE_TRACE_POINTS @@ -305,8 +305,9 @@ bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, { struct kvm_vcpu *vcpu; struct cpumask *cpus; + unsigned long i; bool called; - int i, me; + int me; me = get_cpu(); @@ -421,7 +422,9 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) vcpu->kvm = kvm; vcpu->vcpu_id = id; vcpu->pid = NULL; +#ifndef __KVM_HAVE_ARCH_WQP rcuwait_init(&vcpu->wait); +#endif kvm_async_pf_vcpu_init(vcpu); vcpu->pre_pcpu = -1; @@ -432,10 +435,10 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) vcpu->preempted = false; vcpu->ready = false; preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); - vcpu->last_used_slot = 0; + vcpu->last_used_slot = NULL; } -void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) +static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) { kvm_dirty_ring_free(&vcpu->dirty_ring); kvm_arch_vcpu_destroy(vcpu); @@ -450,7 +453,20 @@ void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->run); kmem_cache_free(kvm_vcpu_cache, vcpu); } -EXPORT_SYMBOL_GPL(kvm_vcpu_destroy); + +void kvm_destroy_vcpus(struct kvm *kvm) +{ + unsigned long i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + kvm_vcpu_destroy(vcpu); + xa_erase(&kvm->vcpu_array, i); + } + + atomic_set(&kvm->online_vcpus, 0); +} +EXPORT_SYMBOL_GPL(kvm_destroy_vcpus); #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) @@ -498,6 +514,12 @@ static void kvm_null_fn(void) } #define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn) +/* Iterate over each memslot intersecting [start, last] (inclusive) range */ +#define kvm_for_each_memslot_in_hva_range(node, slots, start, last) \ + for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \ + node; \ + node = interval_tree_iter_next(node, start, last)) \ + static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, const struct kvm_hva_range *range) { @@ -507,6 +529,9 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, struct kvm_memslots *slots; int i, idx; + if (WARN_ON_ONCE(range->end <= range->start)) + return 0; + /* A null handler is allowed if and only if on_lock() is provided. */ if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) && IS_KVM_NULL_FN(range->handler))) @@ -515,15 +540,17 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, idx = srcu_read_lock(&kvm->srcu); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + struct interval_tree_node *node; + slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(slot, slots) { + kvm_for_each_memslot_in_hva_range(node, slots, + range->start, range->end - 1) { unsigned long hva_start, hva_end; + slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]); hva_start = max(range->start, slot->userspace_addr); hva_end = min(range->end, slot->userspace_addr + (slot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; /* * To optimize for the likely case where the address @@ -684,6 +711,9 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, kvm->mn_active_invalidate_count++; spin_unlock(&kvm->mn_invalidate_lock); + gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end, + hva_range.may_block); + __kvm_handle_hva_range(kvm, &hva_range); return 0; @@ -851,21 +881,6 @@ static void kvm_destroy_pm_notifier(struct kvm *kvm) } #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */ -static struct kvm_memslots *kvm_alloc_memslots(void) -{ - int i; - struct kvm_memslots *slots; - - slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT); - if (!slots) - return NULL; - - for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) - slots->id_to_index[i] = -1; - - return slots; -} - static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) { if (!memslot->dirty_bitmap) @@ -875,27 +890,33 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) memslot->dirty_bitmap = NULL; } +/* This does not remove the slot from struct kvm_memslots data structures */ static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { kvm_destroy_dirty_bitmap(slot); kvm_arch_free_memslot(kvm, slot); - slot->flags = 0; - slot->npages = 0; + kfree(slot); } static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots) { + struct hlist_node *idnode; struct kvm_memory_slot *memslot; + int bkt; - if (!slots) + /* + * The same memslot objects live in both active and inactive sets, + * arbitrarily free using index '1' so the second invocation of this + * function isn't operating over a structure with dangling pointers + * (even though this function isn't actually touching them). + */ + if (!slots->node_idx) return; - kvm_for_each_memslot(memslot, slots) + hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1]) kvm_free_memslot(kvm, memslot); - - kvfree(slots); } static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc) @@ -1034,8 +1055,9 @@ int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm) static struct kvm *kvm_create_vm(unsigned long type) { struct kvm *kvm = kvm_arch_alloc_vm(); + struct kvm_memslots *slots; int r = -ENOMEM; - int i; + int i, j; if (!kvm) return ERR_PTR(-ENOMEM); @@ -1050,6 +1072,10 @@ static struct kvm *kvm_create_vm(unsigned long type) mutex_init(&kvm->slots_arch_lock); spin_lock_init(&kvm->mn_invalidate_lock); rcuwait_init(&kvm->mn_memslots_update_rcuwait); + xa_init(&kvm->vcpu_array); + + INIT_LIST_HEAD(&kvm->gpc_list); + spin_lock_init(&kvm->gpc_lock); INIT_LIST_HEAD(&kvm->devices); @@ -1062,13 +1088,20 @@ static struct kvm *kvm_create_vm(unsigned long type) refcount_set(&kvm->users_count, 1); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - struct kvm_memslots *slots = kvm_alloc_memslots(); + for (j = 0; j < 2; j++) { + slots = &kvm->__memslots[i][j]; - if (!slots) - goto out_err_no_arch_destroy_vm; - /* Generations must be different for each address space. */ - slots->generation = i; - rcu_assign_pointer(kvm->memslots[i], slots); + atomic_long_set(&slots->last_used_slot, (unsigned long)NULL); + slots->hva_tree = RB_ROOT_CACHED; + slots->gfn_tree = RB_ROOT; + hash_init(slots->id_hash); + slots->node_idx = j; + + /* Generations must be different for each address space. */ + slots->generation = i; + } + + rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]); } for (i = 0; i < KVM_NR_BUSES; i++) { @@ -1122,8 +1155,6 @@ out_err_no_arch_destroy_vm: WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count)); for (i = 0; i < KVM_NR_BUSES; i++) kfree(kvm_get_bus(kvm, i)); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); cleanup_srcu_struct(&kvm->irq_srcu); out_err_no_irq_srcu: cleanup_srcu_struct(&kvm->srcu); @@ -1188,8 +1219,10 @@ static void kvm_destroy_vm(struct kvm *kvm) #endif kvm_arch_destroy_vm(kvm); kvm_destroy_devices(kvm); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + kvm_free_memslots(kvm, &kvm->__memslots[i][0]); + kvm_free_memslots(kvm, &kvm->__memslots[i][1]); + } cleanup_srcu_struct(&kvm->irq_srcu); cleanup_srcu_struct(&kvm->srcu); kvm_arch_free_vm(kvm); @@ -1259,165 +1292,136 @@ static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot) return 0; } -/* - * Delete a memslot by decrementing the number of used slots and shifting all - * other entries in the array forward one spot. - */ -static inline void kvm_memslot_delete(struct kvm_memslots *slots, - struct kvm_memory_slot *memslot) +static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id) { - struct kvm_memory_slot *mslots = slots->memslots; - int i; - - if (WARN_ON(slots->id_to_index[memslot->id] == -1)) - return; + struct kvm_memslots *active = __kvm_memslots(kvm, as_id); + int node_idx_inactive = active->node_idx ^ 1; - slots->used_slots--; - - if (atomic_read(&slots->last_used_slot) >= slots->used_slots) - atomic_set(&slots->last_used_slot, 0); - - for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) { - mslots[i] = mslots[i + 1]; - slots->id_to_index[mslots[i].id] = i; - } - mslots[i] = *memslot; - slots->id_to_index[memslot->id] = -1; + return &kvm->__memslots[as_id][node_idx_inactive]; } /* - * "Insert" a new memslot by incrementing the number of used slots. Returns - * the new slot's initial index into the memslots array. + * Helper to get the address space ID when one of memslot pointers may be NULL. + * This also serves as a sanity that at least one of the pointers is non-NULL, + * and that their address space IDs don't diverge. */ -static inline int kvm_memslot_insert_back(struct kvm_memslots *slots) +static int kvm_memslots_get_as_id(struct kvm_memory_slot *a, + struct kvm_memory_slot *b) { - return slots->used_slots++; -} + if (WARN_ON_ONCE(!a && !b)) + return 0; -/* - * Move a changed memslot backwards in the array by shifting existing slots - * with a higher GFN toward the front of the array. Note, the changed memslot - * itself is not preserved in the array, i.e. not swapped at this time, only - * its new index into the array is tracked. Returns the changed memslot's - * current index into the memslots array. - */ -static inline int kvm_memslot_move_backward(struct kvm_memslots *slots, - struct kvm_memory_slot *memslot) -{ - struct kvm_memory_slot *mslots = slots->memslots; - int i; + if (!a) + return b->as_id; + if (!b) + return a->as_id; - if (WARN_ON_ONCE(slots->id_to_index[memslot->id] == -1) || - WARN_ON_ONCE(!slots->used_slots)) - return -1; + WARN_ON_ONCE(a->as_id != b->as_id); + return a->as_id; +} - /* - * Move the target memslot backward in the array by shifting existing - * memslots with a higher GFN (than the target memslot) towards the - * front of the array. - */ - for (i = slots->id_to_index[memslot->id]; i < slots->used_slots - 1; i++) { - if (memslot->base_gfn > mslots[i + 1].base_gfn) - break; +static void kvm_insert_gfn_node(struct kvm_memslots *slots, + struct kvm_memory_slot *slot) +{ + struct rb_root *gfn_tree = &slots->gfn_tree; + struct rb_node **node, *parent; + int idx = slots->node_idx; - WARN_ON_ONCE(memslot->base_gfn == mslots[i + 1].base_gfn); + parent = NULL; + for (node = &gfn_tree->rb_node; *node; ) { + struct kvm_memory_slot *tmp; - /* Shift the next memslot forward one and update its index. */ - mslots[i] = mslots[i + 1]; - slots->id_to_index[mslots[i].id] = i; + tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]); + parent = *node; + if (slot->base_gfn < tmp->base_gfn) + node = &(*node)->rb_left; + else if (slot->base_gfn > tmp->base_gfn) + node = &(*node)->rb_right; + else + BUG(); } - return i; + + rb_link_node(&slot->gfn_node[idx], parent, node); + rb_insert_color(&slot->gfn_node[idx], gfn_tree); } -/* - * Move a changed memslot forwards in the array by shifting existing slots with - * a lower GFN toward the back of the array. Note, the changed memslot itself - * is not preserved in the array, i.e. not swapped at this time, only its new - * index into the array is tracked. Returns the changed memslot's final index - * into the memslots array. - */ -static inline int kvm_memslot_move_forward(struct kvm_memslots *slots, - struct kvm_memory_slot *memslot, - int start) +static void kvm_erase_gfn_node(struct kvm_memslots *slots, + struct kvm_memory_slot *slot) { - struct kvm_memory_slot *mslots = slots->memslots; - int i; + rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree); +} - for (i = start; i > 0; i--) { - if (memslot->base_gfn < mslots[i - 1].base_gfn) - break; +static void kvm_replace_gfn_node(struct kvm_memslots *slots, + struct kvm_memory_slot *old, + struct kvm_memory_slot *new) +{ + int idx = slots->node_idx; - WARN_ON_ONCE(memslot->base_gfn == mslots[i - 1].base_gfn); + WARN_ON_ONCE(old->base_gfn != new->base_gfn); - /* Shift the next memslot back one and update its index. */ - mslots[i] = mslots[i - 1]; - slots->id_to_index[mslots[i].id] = i; - } - return i; + rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx], + &slots->gfn_tree); } /* - * Re-sort memslots based on their GFN to account for an added, deleted, or - * moved memslot. Sorting memslots by GFN allows using a binary search during - * memslot lookup. - * - * IMPORTANT: Slots are sorted from highest GFN to lowest GFN! I.e. the entry - * at memslots[0] has the highest GFN. - * - * The sorting algorithm takes advantage of having initially sorted memslots - * and knowing the position of the changed memslot. Sorting is also optimized - * by not swapping the updated memslot and instead only shifting other memslots - * and tracking the new index for the update memslot. Only once its final - * index is known is the updated memslot copied into its position in the array. - * - * - When deleting a memslot, the deleted memslot simply needs to be moved to - * the end of the array. - * - * - When creating a memslot, the algorithm "inserts" the new memslot at the - * end of the array and then it forward to its correct location. + * Replace @old with @new in the inactive memslots. * - * - When moving a memslot, the algorithm first moves the updated memslot - * backward to handle the scenario where the memslot's GFN was changed to a - * lower value. update_memslots() then falls through and runs the same flow - * as creating a memslot to move the memslot forward to handle the scenario - * where its GFN was changed to a higher value. + * With NULL @old this simply adds @new. + * With NULL @new this simply removes @old. * - * Note, slots are sorted from highest->lowest instead of lowest->highest for - * historical reasons. Originally, invalid memslots where denoted by having - * GFN=0, thus sorting from highest->lowest naturally sorted invalid memslots - * to the end of the array. The current algorithm uses dedicated logic to - * delete a memslot and thus does not rely on invalid memslots having GFN=0. - * - * The other historical motiviation for highest->lowest was to improve the - * performance of memslot lookup. KVM originally used a linear search starting - * at memslots[0]. On x86, the largest memslot usually has one of the highest, - * if not *the* highest, GFN, as the bulk of the guest's RAM is located in a - * single memslot above the 4gb boundary. As the largest memslot is also the - * most likely to be referenced, sorting it to the front of the array was - * advantageous. The current binary search starts from the middle of the array - * and uses an LRU pointer to improve performance for all memslots and GFNs. + * If @new is non-NULL its hva_node[slots_idx] range has to be set + * appropriately. */ -static void update_memslots(struct kvm_memslots *slots, - struct kvm_memory_slot *memslot, - enum kvm_mr_change change) +static void kvm_replace_memslot(struct kvm *kvm, + struct kvm_memory_slot *old, + struct kvm_memory_slot *new) { - int i; + int as_id = kvm_memslots_get_as_id(old, new); + struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id); + int idx = slots->node_idx; - if (change == KVM_MR_DELETE) { - kvm_memslot_delete(slots, memslot); - } else { - if (change == KVM_MR_CREATE) - i = kvm_memslot_insert_back(slots); - else - i = kvm_memslot_move_backward(slots, memslot); - i = kvm_memslot_move_forward(slots, memslot, i); + if (old) { + hash_del(&old->id_node[idx]); + interval_tree_remove(&old->hva_node[idx], &slots->hva_tree); - /* - * Copy the memslot to its new position in memslots and update - * its index accordingly. - */ - slots->memslots[i] = *memslot; - slots->id_to_index[memslot->id] = i; + if ((long)old == atomic_long_read(&slots->last_used_slot)) + atomic_long_set(&slots->last_used_slot, (long)new); + + if (!new) { + kvm_erase_gfn_node(slots, old); + return; + } + } + + /* + * Initialize @new's hva range. Do this even when replacing an @old + * slot, kvm_copy_memslot() deliberately does not touch node data. + */ + new->hva_node[idx].start = new->userspace_addr; + new->hva_node[idx].last = new->userspace_addr + + (new->npages << PAGE_SHIFT) - 1; + + /* + * (Re)Add the new memslot. There is no O(1) interval_tree_replace(), + * hva_node needs to be swapped with remove+insert even though hva can't + * change when replacing an existing slot. + */ + hash_add(slots->id_hash, &new->id_node[idx], new->id); + interval_tree_insert(&new->hva_node[idx], &slots->hva_tree); + + /* + * If the memslot gfn is unchanged, rb_replace_node() can be used to + * switch the node in the gfn tree instead of removing the old and + * inserting the new as two separate operations. Replacement is a + * single O(1) operation versus two O(log(n)) operations for + * remove+insert. + */ + if (old && old->base_gfn == new->base_gfn) { + kvm_replace_gfn_node(slots, old, new); + } else { + if (old) + kvm_erase_gfn_node(slots, old); + kvm_insert_gfn_node(slots, new); } } @@ -1435,11 +1439,12 @@ static int check_memory_region_flags(const struct kvm_userspace_memory_region *m return 0; } -static struct kvm_memslots *install_new_memslots(struct kvm *kvm, - int as_id, struct kvm_memslots *slots) +static void kvm_swap_active_memslots(struct kvm *kvm, int as_id) { - struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id); - u64 gen = old_memslots->generation; + struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id); + + /* Grab the generation from the activate memslots. */ + u64 gen = __kvm_memslots(kvm, as_id)->generation; WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; @@ -1490,60 +1495,226 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, kvm_arch_memslots_updated(kvm, gen); slots->generation = gen; - - return old_memslots; } -static size_t kvm_memslots_size(int slots) +static int kvm_prepare_memory_region(struct kvm *kvm, + const struct kvm_memory_slot *old, + struct kvm_memory_slot *new, + enum kvm_mr_change change) { - return sizeof(struct kvm_memslots) + - (sizeof(struct kvm_memory_slot) * slots); + int r; + + /* + * If dirty logging is disabled, nullify the bitmap; the old bitmap + * will be freed on "commit". If logging is enabled in both old and + * new, reuse the existing bitmap. If logging is enabled only in the + * new and KVM isn't using a ring buffer, allocate and initialize a + * new bitmap. + */ + if (change != KVM_MR_DELETE) { + if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) + new->dirty_bitmap = NULL; + else if (old && old->dirty_bitmap) + new->dirty_bitmap = old->dirty_bitmap; + else if (!kvm->dirty_ring_size) { + r = kvm_alloc_dirty_bitmap(new); + if (r) + return r; + + if (kvm_dirty_log_manual_protect_and_init_set(kvm)) + bitmap_set(new->dirty_bitmap, 0, new->npages); + } + } + + r = kvm_arch_prepare_memory_region(kvm, old, new, change); + + /* Free the bitmap on failure if it was allocated above. */ + if (r && new && new->dirty_bitmap && old && !old->dirty_bitmap) + kvm_destroy_dirty_bitmap(new); + + return r; } -static void kvm_copy_memslots(struct kvm_memslots *to, - struct kvm_memslots *from) +static void kvm_commit_memory_region(struct kvm *kvm, + struct kvm_memory_slot *old, + const struct kvm_memory_slot *new, + enum kvm_mr_change change) { - memcpy(to, from, kvm_memslots_size(from->used_slots)); + /* + * Update the total number of memslot pages before calling the arch + * hook so that architectures can consume the result directly. + */ + if (change == KVM_MR_DELETE) + kvm->nr_memslot_pages -= old->npages; + else if (change == KVM_MR_CREATE) + kvm->nr_memslot_pages += new->npages; + + kvm_arch_commit_memory_region(kvm, old, new, change); + + switch (change) { + case KVM_MR_CREATE: + /* Nothing more to do. */ + break; + case KVM_MR_DELETE: + /* Free the old memslot and all its metadata. */ + kvm_free_memslot(kvm, old); + break; + case KVM_MR_MOVE: + case KVM_MR_FLAGS_ONLY: + /* + * Free the dirty bitmap as needed; the below check encompasses + * both the flags and whether a ring buffer is being used) + */ + if (old->dirty_bitmap && !new->dirty_bitmap) + kvm_destroy_dirty_bitmap(old); + + /* + * The final quirk. Free the detached, old slot, but only its + * memory, not any metadata. Metadata, including arch specific + * data, may be reused by @new. + */ + kfree(old); + break; + default: + BUG(); + } } /* - * Note, at a minimum, the current number of used slots must be allocated, even - * when deleting a memslot, as we need a complete duplicate of the memslots for - * use when invalidating a memslot prior to deleting/moving the memslot. + * Activate @new, which must be installed in the inactive slots by the caller, + * by swapping the active slots and then propagating @new to @old once @old is + * unreachable and can be safely modified. + * + * With NULL @old this simply adds @new to @active (while swapping the sets). + * With NULL @new this simply removes @old from @active and frees it + * (while also swapping the sets). */ -static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old, - enum kvm_mr_change change) +static void kvm_activate_memslot(struct kvm *kvm, + struct kvm_memory_slot *old, + struct kvm_memory_slot *new) { - struct kvm_memslots *slots; - size_t new_size; + int as_id = kvm_memslots_get_as_id(old, new); - if (change == KVM_MR_CREATE) - new_size = kvm_memslots_size(old->used_slots + 1); - else - new_size = kvm_memslots_size(old->used_slots); + kvm_swap_active_memslots(kvm, as_id); - slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT); - if (likely(slots)) - kvm_copy_memslots(slots, old); + /* Propagate the new memslot to the now inactive memslots. */ + kvm_replace_memslot(kvm, old, new); +} - return slots; +static void kvm_copy_memslot(struct kvm_memory_slot *dest, + const struct kvm_memory_slot *src) +{ + dest->base_gfn = src->base_gfn; + dest->npages = src->npages; + dest->dirty_bitmap = src->dirty_bitmap; + dest->arch = src->arch; + dest->userspace_addr = src->userspace_addr; + dest->flags = src->flags; + dest->id = src->id; + dest->as_id = src->as_id; +} + +static void kvm_invalidate_memslot(struct kvm *kvm, + struct kvm_memory_slot *old, + struct kvm_memory_slot *invalid_slot) +{ + /* + * Mark the current slot INVALID. As with all memslot modifications, + * this must be done on an unreachable slot to avoid modifying the + * current slot in the active tree. + */ + kvm_copy_memslot(invalid_slot, old); + invalid_slot->flags |= KVM_MEMSLOT_INVALID; + kvm_replace_memslot(kvm, old, invalid_slot); + + /* + * Activate the slot that is now marked INVALID, but don't propagate + * the slot to the now inactive slots. The slot is either going to be + * deleted or recreated as a new slot. + */ + kvm_swap_active_memslots(kvm, old->as_id); + + /* + * From this point no new shadow pages pointing to a deleted, or moved, + * memslot will be created. Validation of sp->gfn happens in: + * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) + * - kvm_is_visible_gfn (mmu_check_root) + */ + kvm_arch_flush_shadow_memslot(kvm, old); + + /* Was released by kvm_swap_active_memslots, reacquire. */ + mutex_lock(&kvm->slots_arch_lock); + + /* + * Copy the arch-specific field of the newly-installed slot back to the + * old slot as the arch data could have changed between releasing + * slots_arch_lock in install_new_memslots() and re-acquiring the lock + * above. Writers are required to retrieve memslots *after* acquiring + * slots_arch_lock, thus the active slot's data is guaranteed to be fresh. + */ + old->arch = invalid_slot->arch; +} + +static void kvm_create_memslot(struct kvm *kvm, + struct kvm_memory_slot *new) +{ + /* Add the new memslot to the inactive set and activate. */ + kvm_replace_memslot(kvm, NULL, new); + kvm_activate_memslot(kvm, NULL, new); +} + +static void kvm_delete_memslot(struct kvm *kvm, + struct kvm_memory_slot *old, + struct kvm_memory_slot *invalid_slot) +{ + /* + * Remove the old memslot (in the inactive memslots) by passing NULL as + * the "new" slot, and for the invalid version in the active slots. + */ + kvm_replace_memslot(kvm, old, NULL); + kvm_activate_memslot(kvm, invalid_slot, NULL); +} + +static void kvm_move_memslot(struct kvm *kvm, + struct kvm_memory_slot *old, + struct kvm_memory_slot *new, + struct kvm_memory_slot *invalid_slot) +{ + /* + * Replace the old memslot in the inactive slots, and then swap slots + * and replace the current INVALID with the new as well. + */ + kvm_replace_memslot(kvm, old, new); + kvm_activate_memslot(kvm, invalid_slot, new); +} + +static void kvm_update_flags_memslot(struct kvm *kvm, + struct kvm_memory_slot *old, + struct kvm_memory_slot *new) +{ + /* + * Similar to the MOVE case, but the slot doesn't need to be zapped as + * an intermediate step. Instead, the old memslot is simply replaced + * with a new, updated copy in both memslot sets. + */ + kvm_replace_memslot(kvm, old, new); + kvm_activate_memslot(kvm, old, new); } static int kvm_set_memslot(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot *new, int as_id, + struct kvm_memory_slot *old, + struct kvm_memory_slot *new, enum kvm_mr_change change) { - struct kvm_memory_slot *slot, old; - struct kvm_memslots *slots; + struct kvm_memory_slot *invalid_slot; int r; /* - * Released in install_new_memslots. + * Released in kvm_swap_active_memslots. * * Must be held from before the current memslots are copied until * after the new memslots are installed with rcu_assign_pointer, - * then released before the synchronize srcu in install_new_memslots. + * then released before the synchronize srcu in kvm_swap_active_memslots. * * When modifying memslots outside of the slots_lock, must be held * before reading the pointer to the current memslots until after all @@ -1554,114 +1725,88 @@ static int kvm_set_memslot(struct kvm *kvm, */ mutex_lock(&kvm->slots_arch_lock); - slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change); - if (!slots) { - mutex_unlock(&kvm->slots_arch_lock); - return -ENOMEM; - } - + /* + * Invalidate the old slot if it's being deleted or moved. This is + * done prior to actually deleting/moving the memslot to allow vCPUs to + * continue running by ensuring there are no mappings or shadow pages + * for the memslot when it is deleted/moved. Without pre-invalidation + * (and without a lock), a window would exist between effecting the + * delete/move and committing the changes in arch code where KVM or a + * guest could |
