From e5d83c74a5800c2a1fa3ba982c1c4b2b39ae6db2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 16 Feb 2017 10:40:56 +0100 Subject: kvm: make KVM_CAP_ENABLE_CAP_VM architecture agnostic The first such capability to be handled in virt/kvm/ will be manual dirty page reprotection. Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2679e476b6c3..1d6b77162d7c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2948,6 +2948,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #endif case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: + case KVM_CAP_ENABLE_CAP_VM: return 1; #ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: @@ -2971,6 +2972,21 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) return kvm_vm_ioctl_check_extension(kvm, arg); } +int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm, + struct kvm_enable_cap *cap) +{ + return -EINVAL; +} + +static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, + struct kvm_enable_cap *cap) +{ + switch (cap->cap) { + default: + return kvm_vm_ioctl_enable_cap(kvm, cap); + } +} + static long kvm_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2984,6 +3000,15 @@ static long kvm_vm_ioctl(struct file *filp, case KVM_CREATE_VCPU: r = kvm_vm_ioctl_create_vcpu(kvm, arg); break; + case KVM_ENABLE_CAP: { + struct kvm_enable_cap cap; + + r = -EFAULT; + if (copy_from_user(&cap, argp, sizeof(cap))) + goto out; + r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap); + break; + } case KVM_SET_USER_MEMORY_REGION: { struct kvm_userspace_memory_region kvm_userspace_mem; -- cgit v1.2.3 From 8fe65a8299f9e1f40cb95308ab7b3c4ad80bf801 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 23 Oct 2018 02:18:42 +0200 Subject: kvm: rename last argument to kvm_get_dirty_log_protect When manual dirty log reprotect will be enabled, kvm_get_dirty_log_protect's pointer argument will always be false on exit, because no TLB flush is needed until the manual re-protection operation. Rename it from "is_dirty" to "flush", which more accurately tells the caller what they have to do with it. Signed-off-by: Paolo Bonzini --- virt/kvm/arm/arm.c | 6 +++--- virt/kvm/kvm_main.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'virt') diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 23774970c9df..120a2663dab9 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -1205,14 +1205,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - bool is_dirty = false; + bool flush = false; int r; mutex_lock(&kvm->slots_lock); - r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); + r = kvm_get_dirty_log_protect(kvm, log, &flush); - if (is_dirty) + if (flush) kvm_flush_remote_tlbs(kvm); mutex_unlock(&kvm->slots_lock); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1d6b77162d7c..54f0fcfd431e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1154,7 +1154,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log); * */ int kvm_get_dirty_log_protect(struct kvm *kvm, - struct kvm_dirty_log *log, bool *is_dirty) + struct kvm_dirty_log *log, bool *flush) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; @@ -1181,7 +1181,7 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, memset(dirty_bitmap_buffer, 0, n); spin_lock(&kvm->mmu_lock); - *is_dirty = false; + *flush = false; for (i = 0; i < n / sizeof(long); i++) { unsigned long mask; gfn_t offset; @@ -1189,7 +1189,7 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, if (!dirty_bitmap[i]) continue; - *is_dirty = true; + *flush = true; mask = xchg(&dirty_bitmap[i], 0); dirty_bitmap_buffer[i] = mask; -- cgit v1.2.3 From 2a31b9db153530df4aa02dac8c32837bf5f47019 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 23 Oct 2018 02:36:47 +0200 Subject: kvm: introduce manual dirty log reprotect There are two problems with KVM_GET_DIRTY_LOG. First, and less important, it can take kvm->mmu_lock for an extended period of time. Second, its user can actually see many false positives in some cases. The latter is due to a benign race like this: 1. KVM_GET_DIRTY_LOG returns a set of dirty pages and write protects them. 2. The guest modifies the pages, causing them to be marked ditry. 3. Userspace actually copies the pages. 4. KVM_GET_DIRTY_LOG returns those pages as dirty again, even though they were not written to since (3). This is especially a problem for large guests, where the time between (1) and (3) can be substantial. This patch introduces a new capability which, when enabled, makes KVM_GET_DIRTY_LOG not write-protect the pages it returns. Instead, userspace has to explicitly clear the dirty log bits just before using the content of the page. The new KVM_CLEAR_DIRTY_LOG ioctl can also operate on a 64-page granularity rather than requiring to sync a full memslot; this way, the mmu_lock is taken for small amounts of time, and only a small amount of time will pass between write protection of pages and the sending of their content. Signed-off-by: Paolo Bonzini --- virt/kvm/arm/arm.c | 16 +++++++ virt/kvm/kvm_main.c | 132 +++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 131 insertions(+), 17 deletions(-) (limited to 'virt') diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 120a2663dab9..e91adf77d99a 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -1219,6 +1219,22 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) return r; } +int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log) +{ + bool flush = false; + int r; + + mutex_lock(&kvm->slots_lock); + + r = kvm_clear_dirty_log_protect(kvm, log, &flush); + + if (flush) + kvm_flush_remote_tlbs(kvm); + + mutex_unlock(&kvm->slots_lock); + return r; +} + static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 54f0fcfd431e..0041947b7390 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1133,7 +1133,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log); #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT /** * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages - * are dirty write protect them for next write. + * and reenable dirty page tracking for the corresponding pages. * @kvm: pointer to kvm instance * @log: slot id and address to which we copy the log * @is_dirty: flag set if any page is dirty @@ -1176,37 +1176,114 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, return -ENOENT; n = kvm_dirty_bitmap_bytes(memslot); + *flush = false; + if (kvm->manual_dirty_log_protect) { + /* + * Unlike kvm_get_dirty_log, we always return false in *flush, + * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There + * is some code duplication between this function and + * kvm_get_dirty_log, but hopefully all architecture + * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log + * can be eliminated. + */ + dirty_bitmap_buffer = dirty_bitmap; + } else { + dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); + memset(dirty_bitmap_buffer, 0, n); - dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); - memset(dirty_bitmap_buffer, 0, n); + spin_lock(&kvm->mmu_lock); + for (i = 0; i < n / sizeof(long); i++) { + unsigned long mask; + gfn_t offset; - spin_lock(&kvm->mmu_lock); + if (!dirty_bitmap[i]) + continue; + + *flush = true; + mask = xchg(&dirty_bitmap[i], 0); + dirty_bitmap_buffer[i] = mask; + + if (mask) { + offset = i * BITS_PER_LONG; + kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, + offset, mask); + } + } + spin_unlock(&kvm->mmu_lock); + } + + if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); + +/** + * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap + * and reenable dirty page tracking for the corresponding pages. + * @kvm: pointer to kvm instance + * @log: slot id and address from which to fetch the bitmap of dirty pages + */ +int kvm_clear_dirty_log_protect(struct kvm *kvm, + struct kvm_clear_dirty_log *log, bool *flush) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int as_id, id, n; + gfn_t offset; + unsigned long i; + unsigned long *dirty_bitmap; + unsigned long *dirty_bitmap_buffer; + + as_id = log->slot >> 16; + id = (u16)log->slot; + if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) + return -EINVAL; + + if ((log->first_page & 63) || (log->num_pages & 63)) + return -EINVAL; + + slots = __kvm_memslots(kvm, as_id); + memslot = id_to_memslot(slots, id); + + dirty_bitmap = memslot->dirty_bitmap; + if (!dirty_bitmap) + return -ENOENT; + + n = kvm_dirty_bitmap_bytes(memslot); *flush = false; - for (i = 0; i < n / sizeof(long); i++) { - unsigned long mask; - gfn_t offset; + dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); + if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n)) + return -EFAULT; - if (!dirty_bitmap[i]) + spin_lock(&kvm->mmu_lock); + for (offset = log->first_page, + i = offset / BITS_PER_LONG, n = log->num_pages / BITS_PER_LONG; n--; + i++, offset += BITS_PER_LONG) { + unsigned long mask = *dirty_bitmap_buffer++; + atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i]; + if (!mask) continue; - *flush = true; - - mask = xchg(&dirty_bitmap[i], 0); - dirty_bitmap_buffer[i] = mask; + mask &= atomic_long_fetch_andnot(mask, p); + /* + * mask contains the bits that really have been cleared. This + * never includes any bits beyond the length of the memslot (if + * the length is not aligned to 64 pages), therefore it is not + * a problem if userspace sets them in log->dirty_bitmap. + */ if (mask) { - offset = i * BITS_PER_LONG; + *flush = true; kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask); } } - spin_unlock(&kvm->mmu_lock); - if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) - return -EFAULT; + return 0; } -EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); +EXPORT_SYMBOL_GPL(kvm_clear_dirty_log_protect); #endif bool kvm_largepages_enabled(void) @@ -2949,6 +3026,9 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: case KVM_CAP_ENABLE_CAP_VM: +#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT + case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT: +#endif return 1; #ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: @@ -2982,6 +3062,13 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, struct kvm_enable_cap *cap) { switch (cap->cap) { +#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT + case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT: + if (cap->flags || (cap->args[0] & ~1)) + return -EINVAL; + kvm->manual_dirty_log_protect = cap->args[0]; + return 0; +#endif default: return kvm_vm_ioctl_enable_cap(kvm, cap); } @@ -3029,6 +3116,17 @@ static long kvm_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_get_dirty_log(kvm, &log); break; } +#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT + case KVM_CLEAR_DIRTY_LOG: { + struct kvm_clear_dirty_log log; + + r = -EFAULT; + if (copy_from_user(&log, argp, sizeof(log))) + goto out; + r = kvm_vm_ioctl_clear_dirty_log(kvm, &log); + break; + } +#endif #ifdef CONFIG_KVM_MMIO case KVM_REGISTER_COALESCED_MMIO: { struct kvm_coalesced_mmio_zone zone; -- cgit v1.2.3 From 0d640732dbebed0f10f18526de21652931f0b2f2 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 9 Nov 2018 15:07:10 +0000 Subject: arm64: KVM: Skip MMIO insn after emulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we emulate an MMIO instruction, we advance the CPU state within decode_hsr(), before emulating the instruction effects. Having this logic in decode_hsr() is opaque, and advancing the state before emulation is problematic. It gets in the way of applying consistent single-step logic, and it prevents us from being able to fail an MMIO instruction with a synchronous exception. Clean this up by only advancing the CPU state *after* the effects of the instruction are emulated. Cc: Peter Maydell Reviewed-by: Alex Bennée Reviewed-by: Christoffer Dall Signed-off-by: Mark Rutland Signed-off-by: Marc Zyngier --- virt/kvm/arm/mmio.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'virt') diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c index dac7ceb1a677..08443a15e6be 100644 --- a/virt/kvm/arm/mmio.c +++ b/virt/kvm/arm/mmio.c @@ -117,6 +117,12 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) vcpu_set_reg(vcpu, vcpu->arch.mmio_decode.rt, data); } + /* + * The MMIO instruction is emulated and should not be re-executed + * in the guest. + */ + kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); + return 0; } @@ -144,11 +150,6 @@ static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len) vcpu->arch.mmio_decode.sign_extend = sign_extend; vcpu->arch.mmio_decode.rt = rt; - /* - * The MMIO instruction is emulated and should not be re-executed - * in the guest. - */ - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); return 0; } -- cgit v1.2.3 From bd7d95cafb499e24903b7d21f9eeb2c5208160c2 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 9 Nov 2018 15:07:11 +0000 Subject: arm64: KVM: Consistently advance singlestep when emulating instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we emulate a guest instruction, we don't advance the hardware singlestep state machine, and thus the guest will receive a software step exception after a next instruction which is not emulated by the host. We bodge around this in an ad-hoc fashion. Sometimes we explicitly check whether userspace requested a single step, and fake a debug exception from within the kernel. Other times, we advance the HW singlestep state rely on the HW to generate the exception for us. Thus, the observed step behaviour differs for host and guest. Let's make this simpler and consistent by always advancing the HW singlestep state machine when we skip an instruction. Thus we can rely on the hardware to generate the singlestep exception for us, and never need to explicitly check for an active-pending step, nor do we need to fake a debug exception from the guest. Cc: Peter Maydell Reviewed-by: Alex Bennée Reviewed-by: Christoffer Dall Signed-off-by: Mark Rutland Signed-off-by: Marc Zyngier --- virt/kvm/arm/arm.c | 2 -- virt/kvm/arm/hyp/vgic-v3-sr.c | 6 +++++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'virt') diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 23774970c9df..4adcee5fc126 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -674,8 +674,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) ret = kvm_handle_mmio_return(vcpu, vcpu->run); if (ret) return ret; - if (kvm_arm_handle_step_debug(vcpu, vcpu->run)) - return 0; } if (run->immediate_exit) diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c index 616e5a433ab0..9652c453480f 100644 --- a/virt/kvm/arm/hyp/vgic-v3-sr.c +++ b/virt/kvm/arm/hyp/vgic-v3-sr.c @@ -1012,8 +1012,10 @@ int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) esr = kvm_vcpu_get_hsr(vcpu); if (vcpu_mode_is_32bit(vcpu)) { - if (!kvm_condition_valid(vcpu)) + if (!kvm_condition_valid(vcpu)) { + __kvm_skip_instr(vcpu); return 1; + } sysreg = esr_cp15_to_sysreg(esr); } else { @@ -1123,6 +1125,8 @@ int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) rt = kvm_vcpu_sys_get_rt(vcpu); fn(vcpu, vmcr, rt); + __kvm_skip_instr(vcpu); + return 1; } -- cgit v1.2.3 From fb544d1ca65a89f7a3895f7531221ceeed74ada7 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 11 Dec 2018 13:23:57 +0100 Subject: KVM: arm/arm64: Fix VMID alloc race by reverting to lock-less We recently addressed a VMID generation race by introducing a read/write lock around accesses and updates to the vmid generation values. However, kvm_arch_vcpu_ioctl_run() also calls need_new_vmid_gen() but does so without taking the read lock. As far as I can tell, this can lead to the same kind of race: VM 0, VCPU 0 VM 0, VCPU 1 ------------ ------------ update_vttbr (vmid 254) update_vttbr (vmid 1) // roll over read_lock(kvm_vmid_lock); force_vm_exit() local_irq_disable need_new_vmid_gen == false //because vmid gen matches enter_guest (vmid 254) kvm_arch.vttbr = : read_unlock(kvm_vmid_lock); enter_guest (vmid 1) Which results in running two VCPUs in the same VM with different VMIDs and (even worse) other VCPUs from other VMs could now allocate clashing VMID 254 from the new generation as long as VCPU 0 is not exiting. Attempt to solve this by making sure vttbr is updated before another CPU can observe the updated VMID generation. Cc: stable@vger.kernel.org Fixes: f0cf47d939d0 "KVM: arm/arm64: Close VMID generation race" Reviewed-by: Julien Thierry Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- virt/kvm/arm/arm.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'virt') diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 4adcee5fc126..d9273f972828 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -66,7 +66,7 @@ static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_arm_running_vcpu); static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); static u32 kvm_next_vmid; static unsigned int kvm_vmid_bits __read_mostly; -static DEFINE_RWLOCK(kvm_vmid_lock); +static DEFINE_SPINLOCK(kvm_vmid_lock); static bool vgic_present; @@ -484,7 +484,9 @@ void force_vm_exit(const cpumask_t *mask) */ static bool need_new_vmid_gen(struct kvm *kvm) { - return unlikely(kvm->arch.vmid_gen != atomic64_read(&kvm_vmid_gen)); + u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen); + smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */ + return unlikely(READ_ONCE(kvm->arch.vmid_gen) != current_vmid_gen); } /** @@ -499,16 +501,11 @@ static void update_vttbr(struct kvm *kvm) { phys_addr_t pgd_phys; u64 vmid, cnp = kvm_cpu_has_cnp() ? VTTBR_CNP_BIT : 0; - bool new_gen; - read_lock(&kvm_vmid_lock); - new_gen = need_new_vmid_gen(kvm); - read_unlock(&kvm_vmid_lock); - - if (!new_gen) + if (!need_new_vmid_gen(kvm)) return; - write_lock(&kvm_vmid_lock); + spin_lock(&kvm_vmid_lock); /* * We need to re-check the vmid_gen here to ensure that if another vcpu @@ -516,7 +513,7 @@ static void update_vttbr(struct kvm *kvm) * use the same vmid. */ if (!need_new_vmid_gen(kvm)) { - write_unlock(&kvm_vmid_lock); + spin_unlock(&kvm_vmid_lock); return; } @@ -539,7 +536,6 @@ static void update_vttbr(struct kvm *kvm) kvm_call_hyp(__kvm_flush_vm_context); } - kvm->arch.vmid_gen = atomic64_read(&kvm_vmid_gen); kvm->arch.vmid = kvm_next_vmid; kvm_next_vmid++; kvm_next_vmid &= (1 << kvm_vmid_bits) - 1; @@ -550,7 +546,10 @@ static void update_vttbr(struct kvm *kvm) vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits); kvm->arch.vttbr = kvm_phys_to_vttbr(pgd_phys) | vmid | cnp; - write_unlock(&kvm_vmid_lock); + smp_wmb(); + WRITE_ONCE(kvm->arch.vmid_gen, atomic64_read(&kvm_vmid_gen)); + + spin_unlock(&kvm_vmid_lock); } static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 60c3ab30d8c2ff3a52606df03f05af2aae07dc6b Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 11 Dec 2018 12:51:03 +0100 Subject: KVM: arm/arm64: vgic-v2: Set active_source to 0 when restoring state When restoring the active state from userspace, we don't know which CPU was the source for the active state, and this is not architecturally exposed in any of the register state. Set the active_source to 0 in this case. In the future, we can expand on this and exposse the information as additional information to userspace for GICv2 if anyone cares. Cc: stable@vger.kernel.org Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-mmio.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'virt') diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index f56ff1cf52ec..2b450d49a046 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -338,11 +338,26 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu); } else { u32 model = vcpu->kvm->arch.vgic.vgic_model; + u8 active_source; irq->active = active; + + /* + * The GICv2 architecture indicates that the source CPUID for + * an SGI should be provided during an EOI which implies that + * the active state is stored somewhere, but at the same time + * this state is not architecturally exposed anywhere and we + * have no way of knowing the right source. + * + * This may lead to a VCPU not being able to receive + * additional instances of a particular SGI after migration + * for a GICv2 VM on some GIC implementations. Oh well. + */ + active_source = (requester_vcpu) ? requester_vcpu->vcpu_id : 0; + if (model == KVM_DEV_TYPE_ARM_VGIC_V2 && active && vgic_irq_is_sgi(irq->intid)) - irq->active_source = requester_vcpu->vcpu_id; + irq->active_source = active_source; } if (irq->active) -- cgit v1.2.3 From 3f58bf63455588a260b6abc8761c48bc8977b919 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 11 Dec 2018 17:10:34 +0000 Subject: KVM: arm/arm64: Share common code in user_mem_abort() The code for operations such as marking the pfn as dirty, and dcache/icache maintenance during stage 2 fault handling is duplicated between normal pages and PMD hugepages. Instead of creating another copy of the operations when we introduce PUD hugepages, let's share them across the different pagesizes. Signed-off-by: Punit Agrawal Reviewed-by: Suzuki K Poulose Reviewed-by: Christoffer Dall Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- virt/kvm/arm/mmu.c | 49 ++++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 19 deletions(-) (limited to 'virt') diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 5eca48bdb1a6..59595207c5e1 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1475,7 +1475,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, unsigned long fault_status) { int ret; - bool write_fault, exec_fault, writable, hugetlb = false, force_pte = false; + bool write_fault, exec_fault, writable, force_pte = false; unsigned long mmu_seq; gfn_t gfn = fault_ipa >> PAGE_SHIFT; struct kvm *kvm = vcpu->kvm; @@ -1484,7 +1484,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, kvm_pfn_t pfn; pgprot_t mem_type = PAGE_S2; bool logging_active = memslot_is_logging(memslot); - unsigned long flags = 0; + unsigned long vma_pagesize, flags = 0; write_fault = kvm_is_write_fault(vcpu); exec_fault = kvm_vcpu_trap_is_iabt(vcpu); @@ -1504,10 +1504,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; } - if (vma_kernel_pagesize(vma) == PMD_SIZE && !logging_active) { - hugetlb = true; + vma_pagesize = vma_kernel_pagesize(vma); + if (vma_pagesize == PMD_SIZE && !logging_active) { gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; } else { + /* + * Fallback to PTE if it's not one of the Stage 2 + * supported hugepage sizes + */ + vma_pagesize = PAGE_SIZE; + /* * Pages belonging to memslots that don't have the same * alignment for userspace and IPA cannot be mapped using @@ -1573,23 +1579,33 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (!hugetlb && !force_pte) - hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa); + if (vma_pagesize == PAGE_SIZE && !force_pte) { + /* + * Only PMD_SIZE transparent hugepages(THP) are + * currently supported. This code will need to be + * updated to support other THP sizes. + */ + if (transparent_hugepage_adjust(&pfn, &fault_ipa)) + vma_pagesize = PMD_SIZE; + } + + if (writable) + kvm_set_pfn_dirty(pfn); - if (hugetlb) { + if (fault_status != FSC_PERM) + clean_dcache_guest_page(pfn, vma_pagesize); + + if (exec_fault) + invalidate_icache_guest_page(pfn, vma_pagesize); + + if (vma_pagesize == PMD_SIZE) { pmd_t new_pmd = pfn_pmd(pfn, mem_type); new_pmd = pmd_mkhuge(new_pmd); - if (writable) { + if (writable) new_pmd = kvm_s2pmd_mkwrite(new_pmd); - kvm_set_pfn_dirty(pfn); - } - - if (fault_status != FSC_PERM) - clean_dcache_guest_page(pfn, PMD_SIZE); if (exec_fault) { new_pmd = kvm_s2pmd_mkexec(new_pmd); - invalidate_icache_guest_page(pfn, PMD_SIZE); } else if (fault_status == FSC_PERM) { /* Preserve execute if XN was already cleared */ if (stage2_is_exec(kvm, fault_ipa)) @@ -1602,16 +1618,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (writable) { new_pte = kvm_s2pte_mkwrite(new_pte); - kvm_set_pfn_dirty(pfn); mark_page_dirty(kvm, gfn); } - if (fault_status != FSC_PERM) - clean_dcache_guest_page(pfn, PAGE_SIZE); - if (exec_fault) { new_pte = kvm_s2pte_mkexec(new_pte); - invalidate_icache_guest_page(pfn, PAGE_SIZE); } else if (fault_status == FSC_PERM) { /* Preserve execute if XN was already cleared */ if (stage2_is_exec(kvm, fault_ipa)) -- cgit v1.2.3 From 6396b852e46e562f4742ed0a9042b537eb26b8aa Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 11 Dec 2018 17:10:35 +0000 Subject: KVM: arm/arm64: Re-factor setting the Stage 2 entry to exec on fault Stage 2 fault handler marks a page as executable if it is handling an execution fault or if it was a permission fault in which case the executable bit needs to be preserved. The logic to decide if the page should be marked executable is duplicated for PMD and PTE entries. To avoid creating another copy when support for PUD hugepages is introduced refactor the code to share the checks needed to mark a page table entry as executable. Signed-off-by: Punit Agrawal Reviewed-by: Suzuki K Poulose Reviewed-by: Christoffer Dall Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- virt/kvm/arm/mmu.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'virt') diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 59595207c5e1..6912529946fb 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1475,7 +1475,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, unsigned long fault_status) { int ret; - bool write_fault, exec_fault, writable, force_pte = false; + bool write_fault, writable, force_pte = false; + bool exec_fault, needs_exec; unsigned long mmu_seq; gfn_t gfn = fault_ipa >> PAGE_SHIFT; struct kvm *kvm = vcpu->kvm; @@ -1598,19 +1599,25 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (exec_fault) invalidate_icache_guest_page(pfn, vma_pagesize); + /* + * If we took an execution fault we have made the + * icache/dcache coherent above and should now let the s2 + * mapping be executable. + * + * Write faults (!exec_fault && FSC_PERM) are orthogonal to + * execute permissions, and we preserve whatever we have. + */ + needs_exec = exec_fault || + (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa)); + if (vma_pagesize == PMD_SIZE) { pmd_t new_pmd = pfn_pmd(pfn, mem_type); new_pmd = pmd_mkhuge(new_pmd); if (writable) new_pmd = kvm_s2pmd_mkwrite(new_pmd); - if (exec_fault) { + if (needs_exec) new_pmd = kvm_s2pmd_mkexec(new_pmd); - } else if (fault_status == FSC_PERM) { - /* Preserve execute if XN was already cleared */ - if (stage2_is_exec(kvm, fault_ipa)) - new_pmd = kvm_s2pmd_mkexec(new_pmd); - } ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); } else { @@ -1621,13 +1628,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, mark_page_dirty(kvm, gfn); } - if (exec_fault) { + if (needs_exec) new_pte = kvm_s2pte_mkexec(new_pte); - } else if (fault_status == FSC_PERM) { - /* Preserve execute if XN was already cleared */ - if (stage2_is_exec(kvm, fault_ipa)) - new_pte = kvm_s2pte_mkexec(new_pte); - } ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags); } -- cgit v1.2.3 From f8df73388ee25b5e5f1d26249202e7126ca8139d Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 11 Dec 2018 17:10:36 +0000 Subject: KVM: arm/arm64: Introduce helpers to manipulate page table entries Introduce helpers to abstract architectural handling of the conversion of pfn to page table entries and marking a PMD page table entry as a block entry. The helpers are introduced in preparation for supporting PUD hugepages at stage 2 - which are supported on arm64 but do not exist on arm. Signed-off-by: Punit Agrawal Reviewed-by: Suzuki K Poulose Acked-by: Christoffer Dall Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Reviewed-by: Marc Zyngier Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- virt/kvm/arm/mmu.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'virt') diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 6912529946fb..fb5325f7a1ac 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -607,7 +607,7 @@ static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start, addr = start; do { pte = pte_offset_kernel(pmd, addr); - kvm_set_pte(pte, pfn_pte(pfn, prot)); + kvm_set_pte(pte, kvm_pfn_pte(pfn, prot)); get_page(virt_to_page(pte)); pfn++; } while (addr += PAGE_SIZE, addr != end); @@ -1202,7 +1202,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, pfn = __phys_to_pfn(pa); for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) { - pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE); + pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE); if (writable) pte = kvm_s2pte_mkwrite(pte); @@ -1611,8 +1611,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa)); if (vma_pagesize == PMD_SIZE) { - pmd_t new_pmd = pfn_pmd(pfn, mem_type); - new_pmd = pmd_mkhuge(new_pmd); + pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type); + + new_pmd = kvm_pmd_mkhuge(new_pmd); + if (writable) new_pmd = kvm_s2pmd_mkwrite(new_pmd); @@ -1621,7 +1623,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); } else { - pte_t new_pte = pfn_pte(pfn, mem_type); + pte_t new_pte = kvm_pfn_pte(pfn, mem_type); if (writable) { new_pte = kvm_s2pte_mkwrite(new_pte); @@ -1878,7 +1880,7 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) * just like a translation fault and clean the cache to the PoC. */ clean_dcache_guest_page(pfn, PAGE_SIZE); - stage2_pte = pfn_pte(pfn, PAGE_S2); + stage2_pte = kvm_pfn_pte(pfn, PAGE_S2); handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); } -- cgit v1.2.3 From 4ea5af53114091e23a8fc279f25637e6c4e892c6 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 11 Dec 2018 17:10:37 +0000 Subject: KVM: arm64: Support dirty page tracking for PUD hugepages In preparation for creating PUD hugepages at stage 2, add support for write protecting PUD hugepages when they are encountered. Write protecting guest tables is used to track dirty pages when migrating VMs. Also, provide trivial implementations of required kvm_s2pud_* helpers to allow sharing of code with arm32. Signed-off-by: Punit Agrawal Reviewed-by: Christoffer Dall Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon [ Replaced BUG() => WARN_ON() in arm32 pud helpers ] Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- virt/kvm/arm/mmu.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'virt') diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index fb5325f7a1ac..1c669c3c1208 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1347,9 +1347,12 @@ static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd, do { next = stage2_pud_addr_end(kvm, addr, end); if (!stage2_pud_none(kvm, *pud)) { - /* TODO:PUD not supported, revisit later if supported */ - BUG_ON(stage2_pud_huge(kvm, *pud)); - stage2_wp_pmds(kvm, pud, addr, next); + if (stage2_pud_huge(kvm, *pud)) { + if (!kvm_s2pud_readonly(pud)) + kvm_set_s2pud_readonly(pud); + } else { + stage2_wp_pmds(kvm, pud, addr, next); + } } } while (pud++, addr = next, addr != end); } @@ -1392,7 +1395,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) * * Called to start logging dirty pages after memory region * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns - * all present PMD and PTEs are write protected in the memory region. + * all present PUD, PMD and PTEs are write protected in the memory region. * Afterwards read of dirty page log can be called. * * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, -- cgit v1.2.3 From 86d1c55ea605025f78d026e7fc3a2bb4c3fc2d6a Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 11 Dec 2018 17:10:38 +0000 Subject: KVM: arm64: Support PUD hugepage in stage2_is_exec() In preparation for creating PUD hugepages at stage 2, add support for detecting execute permissions on PUD page table entries. Faults due to lack of execute permissions on page table entries is used to perform i-cache invalidation on first execute. Provide trivial implementations of arm32 helpers to allow sharing of code. Signed-off-by: Punit Agrawal Reviewed-by: Christoffer Dall Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon [ Replaced BUG() => WARN_ON(1) in arm32 PUD helpers ] Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- virt/kvm/arm/mmu.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 5 deletions(-) (limited to 'virt') diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 1c669c3c1208..8e44dccd1b47 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1083,23 +1083,66 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache return 0; } -static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr) +/* + * stage2_get_leaf_entry - walk the stage2 VM page tables and return + * true if a valid and present leaf-entry is found. A pointer to the + * leaf-entry is returned in the appropriate level variable - pudpp, + * pmdpp, ptepp. + */ +static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr, + pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp) { + pud_t *pudp; pmd_t *pmdp; pte_t *ptep; - pmdp = stage2_get_pmd(kvm, NULL, addr); + *pudpp = NULL; + *pmdpp = NULL; + *ptepp = NULL; + + pudp = stage2_get_pud(kvm, NULL, addr); + if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp)) + return false; + + if (stage2_pud_huge(kvm, *pudp)) { + *pudpp = pudp; + return true; + } + + pmdp = stage2_pmd_offset(kvm, pudp, addr); if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp)) return false; - if (pmd_thp_or_huge(*pmdp)) - return kvm_s2pmd_exec(pmdp); + if (pmd_thp_or_huge(*pmdp)) { + *pmdpp = pmdp; + return true; + } ptep = pte_offset_kernel(pmdp, addr); if (!ptep || pte_none(*ptep) || !pte_present(*ptep)) return false; - return kvm_s2pte_exec(ptep); + *ptepp = ptep; + return true; +} + +static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr) +{ + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + bool found; + + found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep); + if (!found) + return false; + + if (pudp) + return kvm_s2pud_exec(pudp); + else if (pmdp) + return kvm_s2pmd_exec(pmdp); + else + return kvm_s2pte_exec(ptep); } static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, -- cgit v1.2.3 From eb3f0624ea082def887acc79e97934e27d0188b7 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 11 Dec 2018 17:10:39 +0000 Subject: KVM: arm64: Support handling access faults for PUD hugepages In preparation for creating larger hugepages at Stage 2, extend the access fault handling at Stage 2 to support PUD hugepages when encountered. Provide trivial helpers for arm32 to allow sharing of code. Signed-off-by: Punit Agrawal Reviewed-by: Christoffer Dall Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon [ Replaced BUG() => WARN_ON(1) in PUD helpers ] Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- virt/kvm/arm/mmu.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'virt') diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 8e44dccd1b47..bd749601195f 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1698,6 +1698,7 @@ out_unlock: */ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) { + pud_t *pud; pmd_t *pmd; pte_t *pte; kvm_pfn_t pfn; @@ -1707,24 +1708,23 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) spin_lock(&vcpu->kvm->mmu_lock); - pmd = stage2_get_pmd(vcpu->kvm, NULL, fault_ipa); - if (!pmd || pmd_none(*pmd)) /* Nothing there */ + if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte)) goto out; - if (pmd_thp_or_huge(*pmd)) { /* THP, HugeTLB */ + if (pud) { /* HugeTLB */ + *pud = kvm_s2pud_mkyoung(*pud); + pfn = kvm_pud_pfn(*pud); + pfn_valid = true; + } else if (pmd) { /* THP, HugeTLB */ *pmd = pmd_mkyoung(*pmd); pfn = pmd_pfn(*pmd); pfn_valid = true; - goto out; + } else { + *pte = pte_mkyoung(*pte); /* Just a page... */ + pfn = pte_pfn(*pte); + pfn_valid = true; } - pte = pte_offset_kernel(pmd, fault_ipa); - if (pte_none(*pte)) /* Nothing there either */ - goto out; - - *pte = pte_mkyoung(*pte); /* Just a page... */ - pfn = pte_pfn(*pte); - pfn_valid = true; out: spin_unlock(&vcpu->kvm->mmu_lock); if (pfn_valid) -- cgit v1.2.3 From 35a63966194dd994f44150f07398c62f8dca011e Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 11 Dec 2018 17:10:40 +0000 Subject: KVM: arm64: Update age handlers to support PUD hugepages In preparation for creating larger hugepages at Stage 2, add support to the age handling notifiers for PUD hugepages when encountered. Provide trivial helpers for arm32 to allow sharing code. Signed-off-by: Punit Agrawal Reviewed-by: Christoffer Dall Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon [ Replaced BUG() => WARN_ON(1) for arm32 PUD helpers ] Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- virt/kvm/arm/mmu.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) (limited to 'virt') diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index bd749601195f..3893ea6a50bf 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1225,6 +1225,11 @@ static int stage2_pmdp_test_and_clear_young(pmd_t *pmd) return stage2_ptep_test_and_clear_young((pte_t *)pmd); } +static int stage2_pudp_test_and_clear_young(pud_t *pud) +{ + return stage2_ptep_test_and_clear_young((pte_t *)pud); +} + /** * kvm_phys_addr_ioremap - map a device range to guest IPA * @@ -1932,42 +1937,38 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { + pud_t *pud; pmd_t *pmd; pte_t *pte; - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE); - pmd = stage2_get_pmd(kvm, NULL, gpa); - if (!pmd || pmd_none(*pmd)) /* Nothing there */ + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); + if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte)) return 0; - if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */ + if (pud) + return stage2_pudp_test_and_clear_young(pud); + else if (pmd) return stage2_pmdp_test_and_clear_young(pmd); - - pte = pte_offset_kernel(pmd, gpa); - if (pte_none(*pte)) - return 0; - - return stage2_ptep_test_and_clear_young(pte); + else + return stage2_ptep_test_and_clear_young(pte); } static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { + pud_t *pud; pmd_t *pmd; pte_t *pte; - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE); - pmd = stage2_get_pmd(kvm, NULL, gpa); - if (!pmd || pmd_none(*pmd)) /* Nothing there */ + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); + if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte)) return 0; - if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */ + if (pud) + return kvm_s2pud_young(*pud); + else if (pmd) return pmd_young(*pmd); - - pte = pte_offset_kernel(pmd, gpa); - if (!pte_none(*pte)) /* Just a page... */ + else return pte_young(*pte); - - return 0; } int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) -- cgit v1.2.3 From b8e0ba7c8bea994011aff3b4c35256b180fab874 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 11 Dec 2018 17:10:41 +0000 Subject: KVM: arm64: Add support for creating PUD hugepages at stage 2 KVM only supports PMD hugepages at stage 2. Now that the various page handling routines are updated, extend the stage 2 fault handling to map in PUD hugepages. Addition of PUD hugepage support enables additional page sizes (e.g., 1G with 4K granule) which can be useful on cores that support mapping larger block sizes in the TLB entries. Signed-off-by: Punit Agrawal Reviewed-by: Christoffer Dall Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon [ Replace BUG() => WARN_ON(1) for arm32 PUD helpers ] Signed-off-by: Suzuki Poulose Signed-off-by: Marc Zyngier --- virt/kvm/arm/mmu.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 98 insertions(+), 6 deletions(-) (limited to 'virt') diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 3893ea6a50bf..2dcff38868d4 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -115,6 +115,25 @@ static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd) put_page(virt_to_page(pmd)); } +/** + * stage2_dissolve_pud() - clear and flush huge PUD entry + * @kvm: pointer to kvm structure. + * @addr: IPA + * @pud: pud pointer for IPA + * + * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. Marks all + * pages in the range dirty. + */ +static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp) +{ + if (!stage2_pud_huge(kvm, *pudp)) + return; + + stage2_pud_clear(kvm, pudp); + kvm_tlb_flush_vmid_ipa(kvm, addr); + put_page(virt_to_page(pudp)); +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { @@ -1022,7 +1041,7 @@ static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache pmd_t *pmd; pud = stage2_get_pud(kvm, cache, addr); - if (!pud) + if (!pud || stage2_pud_huge(kvm, *pud)) return NULL; if (stage2_pud_none(kvm, *pud)) { @@ -1083,6 +1102,36 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache return 0; } +static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + phys_addr_t addr, const pud_t *new_pudp) +{ + pud_t *pudp, old_pud; + + pudp = stage2_get_pud(kvm, cache, addr); + VM_BUG_ON(!pudp); + + old_pud = *pudp; + + /* + * A large number of vcpus faulting on the same stage 2 entry, + * can lead to a refault due to the + * stage2_pud_clear()/tlb_flush(). Skip updating the page + * tables if there is no change. + */ + if (pud_val(old_pud) == pud_val(*new_pudp)) + return 0; + + if (stage2_pud_present(kvm, old_pud)) { + stage2_pud_clear(kvm, pudp); + kvm_tlb_flush_vmid_ipa(kvm, addr); + } else { + get_page(virt_to_page(pudp)); + } + + kvm_set_pud(pudp, *new_pudp); + return 0; +} + /* * stage2_get_leaf_entry - walk the stage2 VM page tables and return * true if a valid and present leaf-entry is found. A pointer to the @@ -1149,6 +1198,7 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, phys_addr_t addr, const pte_t *new_pte, unsigned long flags) { + pud_t *pud; pmd_t *pmd; pte_t *pte, old_pte; bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP; @@ -1157,7 +1207,31 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, VM_BUG_ON(logging_active && !cache); /* Create stage-2 page table mapping - Levels 0 and 1 */ - pmd = stage2_get_pmd(kvm, cache, addr); + pud = stage2_get_pud(kvm, cache, addr); + if (!pud) { + /* + * Ignore calls from kvm_set_spte_hva for unallocated + * address ranges. + */ + return 0; + } + + /* + * While dirty page logging - dissolve huge PUD, then continue + * on to allocate page. + */ + if (logging_active) + stage2_dissolve_pud(kvm, addr, pud); + + if (stage2_pud_none(kvm, *pud)) { + if (!cache) + return 0; /* ignore calls from kvm_set_spte_hva */ + pmd = mmu_memory_cache_alloc(cache); + stage2_pud_populate(kvm, pud, pmd); + get_page(virt_to_page(pud)); + } + + pmd = stage2_pmd_offset(kvm, pud, addr); if (!pmd) { /* * Ignore calls from kvm_set_spte_hva for unallocated @@ -1557,12 +1631,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } vma_pagesize = vma_kernel_pagesize(vma); - if (vma_pagesize == PMD_SIZE && !logging_active) { - gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; + /* + * PUD level may not exist for a VM but PMD is guaranteed to + * exist. + */ + if ((vma_pagesize == PMD_SIZE || + (vma_pagesize == PUD_SIZE && kvm_stage2_has_pud(kvm))) && + !logging_active) { + gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT; } else { /* * Fallback to PTE if it's not one of the Stage 2 - * supported hugepage sizes + * supported hugepage sizes or the corresponding level + * doesn't exist */ vma_pagesize = PAGE_SIZE; @@ -1661,7 +1742,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, needs_exec = exec_fault || (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa)); - if (vma_pagesize == PMD_SIZE) { + if (vma_pagesize == PUD_SIZE) { + pud_t new_pud = kvm_pfn_pud(pfn, mem_type); + + new_pud = kvm_pud_mkhuge(new_pud); + if (writable) + new_pud = kvm_s2pud_mkwrite(new_pud); + + if (needs_exec) + new_pud = kvm_s2pud_mkexec(new_pud); + + ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud); + } else if (vma_pagesize == PMD_SIZE) { pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type); new_pmd = kvm_pmd_mkhuge(new_pmd); -- cgit v1.2.3 From 2e2f6c3c0b08eed3fcf7de3c7684c940451bdeb1 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Mon, 26 Nov 2018 18:26:44 +0000 Subject: KVM: arm/arm64: vgic: Do not cond_resched_lock() with IRQs disabled To change the active state of an MMIO, halt is requested for all vcpus of the affected guest before modifying the IRQ state. This is done by calling cond_resched_lock() in vgic_mmio_change_active(). However interrupts are disabled at this point and we cannot reschedule a vcpu. We actually don't need any of this, as kvm_arm_halt_guest ensures that all the other vcpus are out of the guest. Let's just drop that useless code. Signed-off-by: Julien Thierry Suggested-by: Christoffer Dall Cc: stable@vger.kernel.org Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-mmio.c | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'virt') diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index 2b450d49a046..7c2231950c33 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -313,27 +313,6 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, spin_lock_irqsave(&irq->irq_lock, flags); - /* - * If this virtual IRQ was written into a list register, we - * have to make sure the CPU that runs the VCPU thread has - * synced back the LR state to the struct vgic_irq. - * - * As long as the conditions below are true, we know the VCPU thread - * may be on its way back from the guest (we kicked the VCPU thread in - * vgic_change_active_prepare) and still has to sync back this IRQ, - * so we release and re-acquire the spin_lock to let the other thread - * sync back the IRQ. - * - * When accessing VGIC state from user space, requester_vcpu is - * NULL, which is fine, because we guarantee that no VCPUs are running - * when accessing VGIC state from user space so irq->vcpu->cpu is - * always -1. - */ - while (irq->vcpu && /* IRQ may have state in an LR somewhere */ - irq->vcpu != requester_vcpu && /* Current thread is not the VCPU thread */ - irq->vcpu->cpu != -1) /* VCPU thread is running */ - cond_resched_lock(&irq->irq_lock); - if (irq->hw) { vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu); } else { -- cgit v1.2.3 From 6992195cc6c6c4d673a3266ae59cbeeb746d61af Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 6 Nov 2018 13:33:38 +0100 Subject: KVM: arm64: Clarify explanation of STAGE2_PGTABLE_LEVELS In attempting to re-construct the logic for our stage 2 page table layout I found the reasoning in the comment explaining how we calculate the number of levels used for stage 2 page tables a bit backwards. This commit attempts to clarify the comment, to make it slightly easier to read without having the Arm ARM open on the right page. While we're at it, fixup a typo in a comment that was recently changed. Reviewed-by: Suzuki K Poulose Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- virt/kvm/arm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'virt') diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 2dcff38868d4..f605514395a1 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1356,7 +1356,7 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap) struct page *page = pfn_to_page(pfn); /* - * PageTransCompoungMap() returns true for THP and + * PageTransCompoundMap() returns true for THP and * hugetlbfs. Make sure the adjustment is done only for THP * pages. */ -- cgit v1.2.3 From bea2ef803ade3359026d5d357348842bca9edcf1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 4 Dec 2018 17:11:19 +0000 Subject: KVM: arm/arm64: vgic: Cap SPIs to the VM-defined maximum SPIs should be checked against the VMs specific configuration, and not the architectural maximum. Cc: stable@vger.kernel.org Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'virt') diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 7cfdfbc910e0..8ab0491bcc94 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -108,8 +108,8 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, } /* SPIs */ - if (intid <= VGIC_MAX_SPI) { - intid = array_index_nospec(intid, VGIC_MAX_SPI); + if (intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) { + intid = array_index_nospec(intid, kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS); return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS]; } -- cgit v1.2.3 From c23b2e6fc4ca346018618266bcabd335c0a8a49e Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 12 Dec 2018 14:11:23 -0600 Subject: KVM: arm/arm64: vgic: Fix off-by-one bug in vgic_get_irq() When using the nospec API, it should be taken into account that: "...if the CPU speculates past the bounds check then * array_index_nospec() will clamp the index within the range of [0, * size)." The above is part of the header for macro array_index_nospec() in linux/nospec.h Now, in this particular case, if intid evaluates to exactly VGIC_MAX_SPI or to exaclty VGIC_MAX_PRIVATE, the array_index_nospec() macro ends up returning VGIC_MAX_SPI - 1 or VGIC_MAX_PRIVATE - 1 respectively, instead of VGIC_MAX_SPI or VGIC_MAX_PRIVATE, which, based on the original logic: /* SGIs and PPIs */ if (intid <= VGIC_MAX_PRIVATE) return &vcpu->arch.vgic_cpu.private_irqs[intid]; /* SPIs */ if (intid <= VGIC_MAX_SPI) return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS]; are valid values for intid. Fix this by calling array_index_nospec() macro with VGIC_MAX_PRIVATE + 1 and VGIC_MAX_SPI + 1 as arguments for its parameter size. Fixes: 41b87599c743 ("KVM: arm/arm64: vgic: fix possible spectre-v1 in vgic_get_irq()") Cc: stable@vger.kernel.org Signed-off-by: Gustavo A. R. Silva [dropped the SPI part which was fixed separately] Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'virt') diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 8ab0491bcc94..f884a54b2601 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -103,7 +103,7 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, { /* SGIs and PPIs */ if (intid <= VGIC_MAX_PRIVATE) { - intid = array_index_nospec(intid, VGIC_MAX_PRIVATE); + intid = array_index_nospec(intid, VGIC_MAX_PRIVATE + 1); return &vcpu->arch.vgic_cpu.private_irqs[intid]; } -- cgit v1.2.3 From 9009782a4937ad0f4a4be6945080d7fa77fa4092 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Sat, 1 Dec 2018 13:21:47 -0800 Subject: KVM: arm/arm64: vgic: Consider priority and active state for pending irq When checking if there are any pending IRQs for the VM, consider the active state and priority of the IRQs as well. Otherwise we could be continuously scheduling a guest hypervisor without it seeing an IRQ. Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'virt') diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index f884a54b2601..a6b135491b6c 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -908,6 +908,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) struct vgic_irq *irq; bool pending = false; unsigned long flags; + struct vgic_vmcr vmcr; if (!vcpu->kvm->arch.vgic.enabled) return false; @@ -915,11 +916,15 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last) return true; + vgic_get_vmcr(vcpu, &vmcr); + spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { spin_lock(&irq->irq_lock); - pending = irq_is_pending(irq) && irq->enabled; + pending = irq_is_pending(irq) && irq->enabled && + !irq->active && + irq->priority < vmcr.pmr; spin_unlock(&irq->irq_lock); if (pending) -- cgit v1.2.3 From 71a7e47f39a2fb971ef9934e98ba089581eff641 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 3 Dec 2018 21:31:24 +0100 Subject: KVM: arm/arm64: Fixup the kvm_exit tracepoint The kvm_exit tracepoint strangely always reported exits as being IRQs. This seems to be because either the __print_symbolic or the tracepoint macros use a variable named idx. Take this chance to update the fields in the tracepoint to reflect the concepts in the arm64 architecture that we pass to the tracepoint and move the exception type table to the same location and header files as the exits code. We also clear out the exception code to 0 for IRQ exits (which translates to UNKNOWN in text) to make it slighyly less confusing to parse the trace output. Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- virt/kvm/arm/trace.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'virt') diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h index 57b3edebbb40..f21f04f8036d 100644 --- a/virt/kvm/arm/trace.h +++ b/virt/kvm/arm/trace.h @@ -26,25 +26,25 @@ TRACE_EVENT(kvm_entry, ); TRACE_EVENT(kvm_exit, - TP_PROTO(int idx, unsigned int exit_reason, unsigned long vcpu_pc), - TP_ARGS(idx, exit_reason, vcpu_pc), + TP_PROTO(int ret, unsigned int esr_ec, unsigned long vcpu_pc), + TP_ARGS(ret, esr_ec, vcpu_pc), TP_STRUCT__entry( - __field( int, idx ) - __field( unsigned int, exit_reason ) + __field( int, ret ) + __field( unsigned int, esr_ec ) __field( unsigned long, vcpu_pc ) ), TP_fast_assign( - __entry->idx = idx; - __entry->exit_reason = exit_reason; + __entry->ret = ARM_EXCEPTION_CODE(ret); + __entry->esr_ec = (ARM_EXCEPTION_CODE(ret) == ARM_EXCEPTION_TRAP) ? esr_ec : 0; __entry->vcpu_pc = vcpu_pc; ), TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx", - __print_symbolic(__entry->idx, kvm_arm_exception_type), - __entry->exit_reason, - __print_symbolic(__entry->exit_reason, kvm_arm_exception_class), + __print_symbolic(__entry->ret, kvm_arm_exception_type), + __entry->esr_ec, + __print_symbolic(__entry->esr_ec, kvm_arm_exception_class), __entry->vcpu_pc) ); -- cgit v1.2.3 From 8a411b060f820140e9894239fde5819bb213bef0 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 27 Nov 2018 13:48:08 +0100 Subject: KVM: arm/arm64: Remove arch timer workqueue The use of a work queue in the hrtimer expire function for the bg_timer is a leftover from the time when we would inject interrupts when the bg_timer expired. Since we are no longer doing that, we can instead call kvm_vcpu_wake_up() directly from the hrtimer function and remove all workqueue functionality from the arch timer code. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- virt/kvm/arm/arch_timer.c | 34 +++++++--------------------------- 1 file changed, 7 insertions(+), 27 deletions(-) (limited to 'virt') diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 17cecc96f735..da261f5e2a91 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -70,11 +70,9 @@ static void soft_timer_start(struct hrtimer *hrt, u64 ns) HRTIMER_MODE_ABS); } -static void soft_timer_cancel(struct hrtimer *hrt, struct work_struct *work) +static void soft_timer_cancel(struct hrtimer *hrt) { hrtimer_cancel(hrt); - if (work) - cancel_work_sync(work); } static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) @@ -102,23 +100,6 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) return IRQ_HANDLED; } -/* - * Work function for handling the backup timer that we schedule when a vcpu is - * no longer running, but had a timer programmed to fire in the future. - */ -static void kvm_timer_inject_irq_work(struct work_struct *work) -{ - struct kvm_vcpu *vcpu; - - vcpu = container_of(work, struct kvm_vcpu, arch.timer_cpu.expired); - - /* - * If the vcpu is blocked we want to wake it up so that it will see - * the timer has expired when entering the guest. - */ - kvm_vcpu_wake_up(vcpu); -} - static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) { u64 cval, now; @@ -188,7 +169,7 @@ static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt) return HRTIMER_RESTART; } - schedule_work(&timer->expired); + kvm_vcpu_wake_up(vcpu); return HRTIMER_NORESTART; } @@ -300,7 +281,7 @@ static void phys_timer_emulate(struct kvm_vcpu *vcpu) * then we also don't need a soft timer. */ if (kvm_timer_should_fire(ptimer) || !kvm_timer_irq_can_fire(ptimer)) { - soft_timer_cancel(&timer->phys_timer, NULL); + soft_timer_cancel(&timer->phys_timer); return; } @@ -426,7 +407,7 @@ void kvm_timer_unschedule(struct kvm_vcpu *vcpu) vtimer_restore_state(vcpu); - soft_timer_cancel(&timer->bg_timer, &timer->expired); + soft_timer_cancel(&timer->bg_timer); } static void set_cntvoff(u64 cntvoff) @@ -544,7 +525,7 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) * In any case, we re-schedule the hrtimer for the physical timer when * coming back to the VCPU thread in kvm_timer_vcpu_load(). */ - soft_timer_cancel(&timer->phys_timer, NULL); + soft_timer_cancel(&timer->phys_timer); /* * The kernel may decide to run userspace after calling vcpu_put, so @@ -637,7 +618,6 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) update_vtimer_cntvoff(vcpu, kvm_phys_timer_read()); vcpu_ptimer(vcpu)->cntvoff = 0; - INIT_WORK(&timer->expired, kvm_timer_inject_irq_work); hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); timer->bg_timer.function = kvm_bg_timer_expire; @@ -794,8 +774,8 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - soft_timer_cancel(&timer->bg_timer, &timer->expired); - soft_timer_cancel(&timer->phys_timer, NULL); + soft_timer_cancel(&timer->bg_timer); + soft_timer_cancel(&timer->phys_timer); kvm_vgic_unmap_phys_irq(vcpu, vtimer->irq.irq); } -- cgit v1.2.3 From 6e14ef1d12dc26ab4f6bef9d47e6906002f0a1ba Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 11 Dec 2018 09:54:11 +0100 Subject: KVM: arm/arm64: arch_timer: Simplify kvm_timer_vcpu_terminate kvm_timer_vcpu_terminate can only be called in two scenarios: 1. As part of cleanup during a failed VCPU create 2. As part of freeing the whole VM (struct kvm refcount == 0) In the first case, we cannot have programmed any timers or mapped any IRQs, and therefore we do not have to cancel anything or unmap anything. In the second case, the VCPU will have gone through kvm_timer_vcpu_put, which will have canceled the emulated physical timer's hrtimer, and we do not need to that here as well. We also do not care if the irq is recorded as mapped or not in the VGIC data structure, because the whole VM is going away. That leaves us only with having to ensure that we cancel the bg_timer if we were blocking the last time we called kvm_timer_vcpu_put(). Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- virt/kvm/arm/arch_timer.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'virt') diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index da261f5e2a91..b07ac4614e1c 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -772,11 +772,8 @@ out_free_irq: void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); soft_timer_cancel(&timer->bg_timer); - soft_timer_cancel(&timer->phys_timer); - kvm_vgic_unmap_phys_irq(vcpu, vtimer->irq.irq); } static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 107352a24900fb458152b92a4e72fbdc83fd5510 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 18 Dec 2018 14:59:09 +0000 Subject: arm/arm64: KVM: vgic: Force VM halt when changing the active state of GICv3 PPIs/SGIs We currently only halt the guest when a vCPU messes with the active state of an SPI. This is perfectly fine for GICv2, but isn't enough for GICv3, where all vCPUs can access the state of any other vCPU. Let's broaden the condition to include any GICv3 interrupt that has an active state (i.e. all but LPIs). Cc: stable@vger.kernel.org Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-mmio.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'virt') diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index 7c2231950c33..ceeda7e04a4d 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -362,14 +362,16 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, */ static void vgic_change_active_prepare(struct kvm_vcpu *vcpu, u32 intid) { - if (intid > VGIC_NR_PRIVATE_IRQS) + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || + intid > VGIC_NR_PRIVATE_IRQS) kvm_arm_halt_guest(vcpu->kvm); } /* See vgic_change_active_prepare */ static void vgic_change_active_finish(struct kvm_vcpu *vcpu, u32 intid) { - if (intid > VGIC_NR_PRIVATE_IRQS) + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || + intid > VGIC_NR_PRIVATE_IRQS) kvm_arm_resume_guest(vcpu->kvm); } -- cgit v1.2.3 From 6794ad5443a2118243e13879f94228524a1a2b92 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 2 Nov 2018 08:53:22 +0100 Subject: KVM: arm/arm64: Fix unintended stage 2 PMD mappings There are two things we need to take care of when we create block mappings in the stage 2 page tables: (1) The alignment within a PMD between the host address range and the guest IPA range must be the same, since otherwise we end up mapping pages with the wrong offset. (2) The head and tail of a memory slot may not cover a full block size, and we have to take care to not map those with block descriptors, since we could expose memory to the guest that the host did not intend to expose. So far, we have been taking care of (1), but not (2), and our commentary describing (1) was somewhat confusing. This commit attempts to factor out the checks of both into a common function, and if we don't pass the check, we won't attempt any PMD mappings for neither hugetlbfs nor THP. Note that we used to only check the alignment for THP, not for hugetlbfs, but as far as I can tell the check needs to be applied to both scenarios. Cc: Ralph Palutke Cc: Lukas Braun Reported-by: Lukas Braun Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- virt/kvm/arm/mmu.c | 86 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 64 insertions(+), 22 deletions(-) (limited to 'virt') diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index f605514395a1..dee3dbd98712 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1595,6 +1595,63 @@ static void kvm_send_hwpoison_signal(unsigned long address, send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); } +static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot, + unsigned long hva) +{ + gpa_t gpa_start, gpa_end; + hva_t uaddr_start, uaddr_end; + size_t size; + + size = memslot->npages * PAGE_SIZE; + + gpa_start = memslot->base_gfn << PAGE_SHIFT; + gpa_end = gpa_start + size; + + uaddr_start = memslot->userspace_addr; + uaddr_end = uaddr_start + size; + + /* + * Pages belonging to memslots that don't have the same alignment + * within a PMD for userspace and IPA cannot be mapped with stage-2 + * PMD entries, because we'll end up mapping the wrong pages. + * + * Consider a layout like the following: + * + * memslot->userspace_addr: + * +-----+--------------------+--------------------+---+ + * |abcde|fgh Stage-1 PMD | Stage-1 PMD tv|xyz| + * +-----+--------------------+--------------------+---+ + * + * memslot->base_gfn << PAGE_SIZE: + * +---+--------------------+--------------------+-----+ + * |abc|def Stage-2 PMD | Stage-2 PMD |tvxyz| + * +---+--------------------+--------------------+-----+ + * + * If we create those stage-2 PMDs, we'll end up with this incorrect + * mapping: + * d -> f + * e -> g + * f -> h + */ + if ((gpa_start & ~S2_PMD_MASK) != (uaddr_start & ~S2_PMD_MASK)) + return false; + + /* + * Next, let's make sure we're not trying to map anything not covered + * by the memslot. This means we have to prohibit PMD size mappings + * for the beginning and end of a non-PMD aligned and non-PMD sized + * memory slot (illustrated by the head and tail parts of the + * userspace view above containing pages 'abcde' and 'xyz', + * respectively). + * + * Note that it doesn't matter if we do the check using the + * userspace_addr or the base_gfn, as both are equally aligned (per + * the check above) and equally sized. + */ + return (hva & S2_PMD_MASK) >= uaddr_start && + (hva & S2_PMD_MASK) + S2_PMD_SIZE <= uaddr_end; +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_memory_slot *memslot, unsigned long hva, unsigned long fault_status) @@ -1621,6 +1678,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; } + if (!fault_supports_stage2_pmd_mappings(memslot, hva)) + force_pte = true; + + if (logging_active) + force_pte = true; + /* Let's check if we will get back a huge page backed by hugetlbfs */ down_read(¤t->mm->mmap_sem); vma = find_vma_intersection(current->mm, hva, hva + 1); @@ -1637,28 +1700,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, */ if ((vma_pagesize == PMD_SIZE || (vma_pagesize == PUD_SIZE && kvm_stage2_has_pud(kvm))) && - !logging_active) { + !force_pte) { gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT; - } else { - /* - * Fallback to PTE if it's not one of the Stage 2 - * supported hugepage sizes or the corresponding level - * doesn't exist - */ - vma_pagesize = PAGE_SIZE; - - /* - * Pages belonging to memslots that don't have the same - * alignment for userspace and IPA cannot be mapped using - * block descriptors even if the pages belong to a THP for - * the process, because the stage-2 block descriptor will - * cover more than a single THP and we loose atomicity for - * unmapping, updates, and splits of the THP or other pages - * in the stage-2 block range. - */ - if ((memslot->userspace_addr & ~PMD_MASK) != - ((memslot->base_gfn << PAGE_SHIFT) & ~PMD_MASK)) - force_pte = true; } up_read(¤t->mm->mmap_sem); @@ -1697,7 +1740,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * should not be mapped with huge pages (it introduces churn * and performance degradation), so force a pte mapping. */ - force_pte = true; flags |= KVM_S2_FLAG_LOGGING_ACTIVE; /* -- cgit v1.2.3 From 58466766cd35754a061414c0c93225db2962948e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 19 Dec 2018 08:28:38 +0000 Subject: arm/arm64: KVM: Add ARM_EXCEPTION_IS_TRAP macro 32 and 64bit use different symbols to identify the traps. 32bit has a fine grained approach (prefetch abort, data abort and HVC), while 64bit is pretty happy with just "trap". This has been fine so far, except that we now need to decode some of that in tracepoints that are common to both architectures. Introduce ARM_EXCEPTION_IS_TRAP which abstracts the trap symbols and make the tracepoint use it. Acked-by: Christoffer Dall Signed-off-by: Marc Zyngier --- virt/kvm/arm/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'virt') diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h index f21f04f8036d..3828beab93f2 100644 --- a/virt/kvm/arm/trace.h +++ b/virt/kvm/arm/trace.h @@ -37,7 +37,7 @@ TRACE_EVENT(kvm_exit, TP_fast_assign( __entry->ret = ARM_EXCEPTION_CODE(ret); - __entry->esr_ec = (ARM_EXCEPTION_CODE(ret) == ARM_EXCEPTION_TRAP) ? esr_ec : 0; + __entry->esr_ec = ARM_EXCEPTION_IS_TRAP(ret) ? esr_ec : 0; __entry->vcpu_pc = vcpu_pc; ), -- cgit v1.2.3 From f1b9dd5eb86cec1fcf66aad17e7701d98d024a9a Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Mon, 17 Dec 2018 13:53:33 -0800 Subject: kvm: Disallow wraparound in kvm_gfn_to_hva_cache_init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, in the case where (gpa + len) wrapped around, the entire region was not validated, as the comment claimed. It doesn't actually seem that wraparound should be allowed here at all. Furthermore, since some callers don't check the return code from this function, it seems prudent to clear ghc->memslot in the event of an error. Fixes: 8f964525a121f ("KVM: Allow cross page reads and writes from cached translations.") Reported-by: Cfir Cohen Signed-off-by: Jim Mattson Reviewed-by: Cfir Cohen Reviewed-by: Marc Orr Cc: Andrew Honig Signed-off-by: Radim Krčmář --- virt/kvm/kvm_main.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0041947b7390..3be46841db06 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2005,32 +2005,33 @@ static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; gfn_t nr_pages_needed = end_gfn - start_gfn + 1; gfn_t nr_pages_avail; + int r = start_gfn <= end_gfn ? 0 : -EINVAL; ghc->gpa = gpa; ghc->generation = slots->generation; ghc->len = len; - ghc->memslot = __gfn_to_memslot(slots, start_gfn); - ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL); - if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) { + ghc->hva = KVM_HVA_ERR_BAD; + + /* + * If the requested region crosses two memslots, we still + * verify that the entire region is valid here. + */ + while (!r && start_gfn <= end_gfn) { + ghc->memslot = __gfn_to_memslot(slots, start_gfn); + ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, + &nr_pages_avail); + if (kvm_is_error_hva(ghc->hva)) + r = -EFAULT; + start_gfn += nr_pages_avail; + } + + /* Use the slow path for cross page reads and writes. */ + if (!r && nr_pages_needed == 1) ghc->hva += offset; - } else { - /* - * If the requested region crosses two memslots, we still - * verify that the entire region is valid here. - */ - while (start_gfn <= end_gfn) { - nr_pages_avail = 0; - ghc->memslot = __gfn_to_memslot(slots, start_gfn); - ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, - &nr_pages_avail); - if (kvm_is_error_hva(ghc->hva)) - return -EFAULT; - start_gfn += nr_pages_avail; - } - /* Use the slow path for cross page reads and writes. */ + else ghc->memslot = NULL; - } - return 0; + + return r; } int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, -- cgit v1.2.3 From 7a86dab8cf2f0fdf508f3555dddfc236623bff60 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 14 Dec 2018 14:34:43 -0800 Subject: kvm: Change offset in kvm_write_guest_offset_cached to unsigned MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the offset is added directly to the hva from the gfn_to_hva_cache, a negative offset could result in an out of bounds write. The existing BUG_ON only checks for addresses beyond the end of the gfn_to_hva_cache, not for addresses before the start of the gfn_to_hva_cache. Note that all current call sites have non-negative offsets. Fixes: 4ec6e8636256 ("kvm: Introduce kvm_write_guest_offset_cached()") Reported-by: Cfir Cohen Signed-off-by: Jim Mattson Reviewed-by: Cfir Cohen Reviewed-by: Peter Shier Reviewed-by: Krish Sadhukhan Reviewed-by: Sean Christopherson Signed-off-by: Radim Krčmář --- virt/kvm/kvm_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3be46841db06..f90ceab3840e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2043,7 +2043,8 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - void *data, int offset, unsigned long len) + void *data, unsigned int offset, + unsigned long len) { struct kvm_memslots *slots = kvm_memslots(kvm); int r; -- cgit v1.2.3 From bdd303cb1bdb24e71eef8e4510b27166bfadf286 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 5 Nov 2018 14:45:03 +0800 Subject: KVM: fix some typos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Wei Yang [Preserved the iff and a probably intentional weird bracket notation. Also dropped the style change to make a single-purpose patch. - Radim] Signed-off-by: Radim Krčmář --- virt/kvm/async_pf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'virt') diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 23c2519c5b32..110cbe3f74f8 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -82,7 +82,7 @@ static void async_pf_execute(struct work_struct *work) might_sleep(); /* - * This work is run asynchromously to the task which owns + * This work is run asynchronously to the task which owns * mm and might be done in another context, so we must * access remotely. */ -- cgit v1.2.3 From 748c0e312fce983bd7854b369b192e24dce90878 Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Thu, 6 Dec 2018 21:21:10 +0800 Subject: KVM: Make kvm_set_spte_hva() return int The patch is to make kvm_set_spte_hva() return int and caller can check return value to determine flush tlb or not. Signed-off-by: Lan Tianyu Acked-by: Paul Mackerras Signed-off-by: Paolo Bonzini --- virt/kvm/arm/mmu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'virt') diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index dee3dbd98712..3053bf2584f8 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -2049,14 +2049,14 @@ static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data } -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { unsigned long end = hva + PAGE_SIZE; kvm_pfn_t pfn = pte_pfn(pte); pte_t stage2_pte; if (!kvm->arch.pgd) - return; + return 0; trace_kvm_set_spte_hva(hva); @@ -2067,6 +2067,8 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) clean_dcache_guest_page(pfn, PAGE_SIZE); stage2_pte = kvm_pfn_pte(pfn, PAGE_S2); handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); + + return 0; } static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) -- cgit v1.2.3 From 0cf853c5e238edf503ebda2fe541e6f4a3d5bd40 Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Thu, 6 Dec 2018 21:21:11 +0800 Subject: KVM/MMU: Move tlb flush in kvm_set_pte_rmapp() to kvm_mmu_notifier_change_pte() This patch is to move tlb flush in kvm_set_pte_rmapp() to kvm_mmu_notifier_change_pte() in order to avoid redundant tlb flush. Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f90ceab3840e..cf7cc0554094 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -354,7 +354,10 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); kvm->mmu_notifier_seq++; - kvm_set_spte_hva(kvm, address, pte); + + if (kvm_set_spte_hva(kvm, address, pte)) + kvm_flush_remote_tlbs(kvm); + spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); } -- cgit v1.2.3