diff options
-rw-r--r-- | Documentation/virt/kvm/api.rst | 53 | ||||
-rw-r--r-- | arch/s390/kvm/diag.c | 2 | ||||
-rw-r--r-- | arch/s390/kvm/gaccess.c | 14 | ||||
-rw-r--r-- | arch/s390/kvm/kvm-s390.c | 4 | ||||
-rw-r--r-- | arch/s390/kvm/priv.c | 4 | ||||
-rw-r--r-- | arch/s390/kvm/sigp.c | 2 | ||||
-rw-r--r-- | arch/x86/include/uapi/asm/kvm.h | 9 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 5 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 68 | ||||
-rw-r--r-- | arch/x86/kvm/x86.h | 1 | ||||
-rw-r--r-- | arch/x86/kvm/xen.c | 315 | ||||
-rw-r--r-- | arch/x86/kvm/xen.h | 18 | ||||
-rw-r--r-- | include/linux/kvm_host.h | 56 | ||||
-rw-r--r-- | include/linux/kvm_types.h | 8 | ||||
-rw-r--r-- | tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c | 59 | ||||
-rw-r--r-- | virt/kvm/pfncache.c | 251 |
16 files changed, 601 insertions, 268 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 09c7e585ff58..0b5a33ee71ee 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -372,7 +372,7 @@ The bits in the dirty bitmap are cleared before the ioctl returns, unless KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is enabled. For more information, see the description of the capability. -Note that the Xen shared info page, if configured, shall always be assumed +Note that the Xen shared_info page, if configured, shall always be assumed to be dirty. KVM will not explicitly mark it such. @@ -5487,8 +5487,9 @@ KVM_PV_ASYNC_CLEANUP_PERFORM __u8 long_mode; __u8 vector; __u8 runstate_update_flag; - struct { + union { __u64 gfn; + __u64 hva; } shared_info; struct { __u32 send_port; @@ -5516,19 +5517,20 @@ type values: KVM_XEN_ATTR_TYPE_LONG_MODE Sets the ABI mode of the VM to 32-bit or 64-bit (long mode). This - determines the layout of the shared info pages exposed to the VM. + determines the layout of the shared_info page exposed to the VM. KVM_XEN_ATTR_TYPE_SHARED_INFO - Sets the guest physical frame number at which the Xen "shared info" + Sets the guest physical frame number at which the Xen shared_info page resides. Note that although Xen places vcpu_info for the first 32 vCPUs in the shared_info page, KVM does not automatically do so - and instead requires that KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO be used - explicitly even when the vcpu_info for a given vCPU resides at the - "default" location in the shared_info page. This is because KVM may - not be aware of the Xen CPU id which is used as the index into the - vcpu_info[] array, so may know the correct default location. - - Note that the shared info page may be constantly written to by KVM; + and instead requires that KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO or + KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA be used explicitly even when + the vcpu_info for a given vCPU resides at the "default" location + in the shared_info page. This is because KVM may not be aware of + the Xen CPU id which is used as the index into the vcpu_info[] + array, so may know the correct default location. + + Note that the shared_info page may be constantly written to by KVM; it contains the event channel bitmap used to deliver interrupts to a Xen guest, amongst other things. It is exempt from dirty tracking mechanisms — KVM will not explicitly mark the page as dirty each @@ -5537,9 +5539,21 @@ KVM_XEN_ATTR_TYPE_SHARED_INFO any vCPU has been running or any event channel interrupts can be routed to the guest. - Setting the gfn to KVM_XEN_INVALID_GFN will disable the shared info + Setting the gfn to KVM_XEN_INVALID_GFN will disable the shared_info page. +KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA + If the KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA flag is also set in the + Xen capabilities, then this attribute may be used to set the + userspace address at which the shared_info page resides, which + will always be fixed in the VMM regardless of where it is mapped + in guest physical address space. This attribute should be used in + preference to KVM_XEN_ATTR_TYPE_SHARED_INFO as it avoids + unnecessary invalidation of an internal cache when the page is + re-mapped in guest physcial address space. + + Setting the hva to zero will disable the shared_info page. + KVM_XEN_ATTR_TYPE_UPCALL_VECTOR Sets the exception vector used to deliver Xen event channel upcalls. This is the HVM-wide vector injected directly by the hypervisor @@ -5636,6 +5650,21 @@ KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO on dirty logging. Setting the gpa to KVM_XEN_INVALID_GPA will disable the vcpu_info. +KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA + If the KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA flag is also set in the + Xen capabilities, then this attribute may be used to set the + userspace address of the vcpu_info for a given vCPU. It should + only be used when the vcpu_info resides at the "default" location + in the shared_info page. In this case it is safe to assume the + userspace address will not change, because the shared_info page is + an overlay on guest memory and remains at a fixed host address + regardless of where it is mapped in guest physical address space + and hence unnecessary invalidation of an internal cache may be + avoided if the guest memory layout is modified. + If the vcpu_info does not reside at the "default" location then + it is not guaranteed to remain at the same host address and + hence the aforementioned cache invalidation is required. + KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO Sets the guest physical address of an additional pvclock structure for a given vCPU. This is typically used for guest vsyscall support. diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 3c65b8258ae6..2a32438e09ce 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -102,7 +102,7 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu) parm.token_addr & 7 || parm.zarch != 0x8000000000000000ULL) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - if (kvm_is_error_gpa(vcpu->kvm, parm.token_addr)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, parm.token_addr)) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); vcpu->arch.pfault_token = parm.token_addr; diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 5bfcc50c1a68..415c99649e43 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -664,7 +664,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, case ASCE_TYPE_REGION1: { union region1_table_entry rfte; - if (kvm_is_error_gpa(vcpu->kvm, ptr)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr)) return PGM_ADDRESSING; if (deref_table(vcpu->kvm, ptr, &rfte.val)) return -EFAULT; @@ -682,7 +682,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, case ASCE_TYPE_REGION2: { union region2_table_entry rste; - if (kvm_is_error_gpa(vcpu->kvm, ptr)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr)) return PGM_ADDRESSING; if (deref_table(vcpu->kvm, ptr, &rste.val)) return -EFAULT; @@ -700,7 +700,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, case ASCE_TYPE_REGION3: { union region3_table_entry rtte; - if (kvm_is_error_gpa(vcpu->kvm, ptr)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr)) return PGM_ADDRESSING; if (deref_table(vcpu->kvm, ptr, &rtte.val)) return -EFAULT; @@ -728,7 +728,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, case ASCE_TYPE_SEGMENT: { union segment_table_entry ste; - if (kvm_is_error_gpa(vcpu->kvm, ptr)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr)) return PGM_ADDRESSING; if (deref_table(vcpu->kvm, ptr, &ste.val)) return -EFAULT; @@ -748,7 +748,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, ptr = ste.fc0.pto * (PAGE_SIZE / 2) + vaddr.px * 8; } } - if (kvm_is_error_gpa(vcpu->kvm, ptr)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr)) return PGM_ADDRESSING; if (deref_table(vcpu->kvm, ptr, &pte.val)) return -EFAULT; @@ -770,7 +770,7 @@ absolute_address: *prot = PROT_TYPE_IEP; return PGM_PROTECTION; } - if (kvm_is_error_gpa(vcpu->kvm, raddr.addr)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, raddr.addr)) return PGM_ADDRESSING; *gpa = raddr.addr; return 0; @@ -957,7 +957,7 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, return rc; } else { gpa = kvm_s390_real_to_abs(vcpu, ga); - if (kvm_is_error_gpa(vcpu->kvm, gpa)) { + if (!kvm_is_gpa_in_memslot(vcpu->kvm, gpa)) { rc = PGM_ADDRESSING; prot = PROT_NONE; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ea63ac769889..3e5a1d7aa81a 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2878,7 +2878,7 @@ static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop) srcu_idx = srcu_read_lock(&kvm->srcu); - if (kvm_is_error_gpa(kvm, mop->gaddr)) { + if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) { r = PGM_ADDRESSING; goto out_unlock; } @@ -2940,7 +2940,7 @@ static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *m srcu_idx = srcu_read_lock(&kvm->srcu); - if (kvm_is_error_gpa(kvm, mop->gaddr)) { + if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) { r = PGM_ADDRESSING; goto out_unlock; } diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index f875a404a0a0..1be19cc9d73c 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -149,7 +149,7 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu) * first page, since address is 8k aligned and memory pieces are always * at least 1MB aligned and have at least a size of 1MB. */ - if (kvm_is_error_gpa(vcpu->kvm, address)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, address)) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); kvm_s390_set_prefix(vcpu, address); @@ -464,7 +464,7 @@ static int handle_test_block(struct kvm_vcpu *vcpu) return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); addr = kvm_s390_real_to_abs(vcpu, addr); - if (kvm_is_error_gpa(vcpu->kvm, addr)) + if (!kvm_is_gpa_in_memslot(vcpu->kvm, addr)) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); /* * We don't expect errors on modern systems, and do not care diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index d9696b530064..55c34cb35428 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -172,7 +172,7 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu, * first page, since address is 8k aligned and memory pieces are always * at least 1MB aligned and have at least a size of 1MB. */ - if (kvm_is_error_gpa(vcpu->kvm, irq.u.prefix.address)) { + if (!kvm_is_gpa_in_memslot(vcpu->kvm, irq.u.prefix.address)) { *reg &= 0xffffffff00000000UL; *reg |= SIGP_STATUS_INVALID_PARAMETER; return SIGP_CC_STATUS_STORED; diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 0ad6bda1fc39..ad29984d5e39 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -549,6 +549,7 @@ struct kvm_x86_mce { #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) #define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6) #define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7) +#define KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA (1 << 8) struct kvm_xen_hvm_config { __u32 flags; @@ -567,9 +568,10 @@ struct kvm_xen_hvm_attr { __u8 long_mode; __u8 vector; __u8 runstate_update_flag; - struct { + union { __u64 gfn; #define KVM_XEN_INVALID_GFN ((__u64)-1) + __u64 hva; } shared_info; struct { __u32 send_port; @@ -611,6 +613,8 @@ struct kvm_xen_hvm_attr { #define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4 /* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG */ #define KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG 0x5 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */ +#define KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA 0x6 struct kvm_xen_vcpu_attr { __u16 type; @@ -618,6 +622,7 @@ struct kvm_xen_vcpu_attr { union { __u64 gpa; #define KVM_XEN_INVALID_GPA ((__u64)-1) + __u64 hva; __u64 pad[8]; struct { __u64 state; @@ -648,6 +653,8 @@ struct kvm_xen_vcpu_attr { #define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID 0x6 #define KVM_XEN_VCPU_ATTR_TYPE_TIMER 0x7 #define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR 0x8 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA 0x9 /* Secure Encrypted Virtualization command */ enum sev_cmd_id { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 681f6d82d015..11b9a9bdc07a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -41,6 +41,7 @@ #include "ioapic.h" #include "trace.h" #include "x86.h" +#include "xen.h" #include "cpuid.h" #include "hyperv.h" #include "smm.h" @@ -502,8 +503,10 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) } /* Check if there are APF page ready requests pending */ - if (enabled) + if (enabled) { kvm_make_request(KVM_REQ_APF_READY, apic->vcpu); + kvm_xen_sw_enable_lapic(apic->vcpu); + } } static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 171ff4d1b7cb..fb2bc9f5fe96 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2854,7 +2854,11 @@ static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp, return v * clock->mult; } -static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp) +/* + * As with get_kvmclock_base_ns(), this counts from boot time, at the + * frequency of CLOCK_MONOTONIC_RAW (hence adding gtos->offs_boot). + */ +static int do_kvmclock_base(s64 *t, u64 *tsc_timestamp) { struct pvclock_gtod_data *gtod = &pvclock_gtod_data; unsigned long seq; @@ -2873,6 +2877,29 @@ static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp) return mode; } +/* + * This calculates CLOCK_MONOTONIC at the time of the TSC snapshot, with + * no boot time offset. + */ +static int do_monotonic(s64 *t, u64 *tsc_timestamp) +{ + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + unsigned long seq; + int mode; + u64 ns; + + do { + seq = read_seqcount_begin(>od->seq); + ns = gtod->clock.base_cycles; + ns += vgettsc(>od->clock, tsc_timestamp, &mode); + ns >>= gtod->clock.shift; + ns += ktime_to_ns(gtod->clock.offset); + } while (unlikely(read_seqcount_retry(>od->seq, seq))); + *t = ns; + + return mode; +} + static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp) { struct pvclock_gtod_data *gtod = &pvclock_gtod_data; @@ -2894,18 +2921,42 @@ static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp) return mode; } -/* returns true if host is using TSC based clocksource */ +/* + * Calculates the kvmclock_base_ns (CLOCK_MONOTONIC_RAW + boot time) and + * reports the TSC value from which it do so. Returns true if host is + * using TSC based clocksource. + */ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp) { /* checked again under seqlock below */ if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) return false; - return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns, - tsc_timestamp)); + return gtod_is_based_on_tsc(do_kvmclock_base(kernel_ns, + tsc_timestamp)); } -/* returns true if host is using TSC based clocksource */ +/* + * Calculates CLOCK_MONOTONIC and reports the TSC value from which it did + * so. Returns true if host is using TSC based clocksource. + */ +bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp) +{ + /* checked again under seqlock below */ + if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) + return false; + + return gtod_is_based_on_tsc(do_monotonic(kernel_ns, + tsc_timestamp)); +} + +/* + * Calculates CLOCK_REALTIME and reports the TSC value from which it did + * so. Returns true if host is using TSC based clocksource. + * + * DO NOT USE this for anything related to migration. You want CLOCK_TAI + * for that. + */ static bool kvm_get_walltime_and_clockread(struct timespec64 *ts, u64 *tsc_timestamp) { @@ -3152,7 +3203,7 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, guest_hv_clock->version = ++vcpu->hv_clock.version; - mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT); + kvm_gpc_mark_dirty_in_slot(gpc); read_unlock_irqrestore(&gpc->lock, flags); trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); @@ -4674,7 +4725,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) KVM_XEN_HVM_CONFIG_SHARED_INFO | KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL | KVM_XEN_HVM_CONFIG_EVTCHN_SEND | - KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE; + KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE | + KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA; if (sched_info_on()) r |= KVM_XEN_HVM_CONFIG_RUNSTATE | KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG; @@ -12027,7 +12079,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; - kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm, vcpu, KVM_HOST_USES_PFN); + kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm); if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 4dc38092d599..a8b71803777b 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -294,6 +294,7 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); u64 get_kvmclock_ns(struct kvm *kvm); uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm); +bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp); int kvm_read_guest_virt(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 4b4e738c6f1b..f65b35a05d91 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -10,7 +10,7 @@ #include "x86.h" #include "xen.h" #include "hyperv.h" -#include "lapic.h" +#include "irq.h" #include <linux/eventfd.h> #include <linux/kvm_host.h> @@ -24,6 +24,7 @@ #include <xen/interface/sched.h> #include <asm/xen/cpuid.h> +#include <asm/pvclock.h> #include "cpuid.h" #include "trace.h" @@ -34,41 +35,32 @@ static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r); DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ); -static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) +static int kvm_xen_shared_info_init(struct kvm *kvm) { struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; struct pvclock_wall_clock *wc; - gpa_t gpa = gfn_to_gpa(gfn); u32 *wc_sec_hi; u32 wc_version; u64 wall_nsec; int ret = 0; int idx = srcu_read_lock(&kvm->srcu); - if (gfn == KVM_XEN_INVALID_GFN) { - kvm_gpc_deactivate(gpc); - goto out; - } + read_lock_irq(&gpc->lock); + while (!kvm_gpc_check(gpc, PAGE_SIZE)) { + read_unlock_irq(&gpc->lock); - do { - ret = kvm_gpc_activate(gpc, gpa, PAGE_SIZE); + ret = kvm_gpc_refresh(gpc, PAGE_SIZE); if (ret) goto out; - /* - * This code mirrors kvm_write_wall_clock() except that it writes - * directly through the pfn cache and doesn't mark the page dirty. - */ - wall_nsec = kvm_get_wall_clock_epoch(kvm); - - /* It could be invalid again already, so we need to check */ read_lock_irq(&gpc->lock); + } - if (gpc->valid) - break; - - read_unlock_irq(&gpc->lock); - } while (1); + /* + * This code mirrors kvm_write_wall_clock() except that it writes + * directly through the pfn cache and doesn't mark the page dirty. + */ + wall_nsec = kvm_get_wall_clock_epoch(kvm); /* Paranoia checks on the 32-bit struct layout */ BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900); @@ -158,8 +150,93 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns) +static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, + bool linux_wa) { + int64_t kernel_now, delta; + uint64_t guest_now; + + /* + * The guest provides the requested timeout in absolute nanoseconds + * of the KVM clock — as *it* sees it, based on the scaled TSC and + * the pvclock information provided by KVM. + * + * The kernel doesn't support hrtimers based on CLOCK_MONOTONIC_RAW + * so use CLOCK_MONOTONIC. In the timescales covered by timers, the + * difference won't matter much as there is no cumulative effect. + * + * Calculate the time for some arbitrary point in time around "now" + * in terms of both kvmclock and CLOCK_MONOTONIC. Calculate the + * delta between the kvmclock "now" value and the guest's requested + * timeout, apply the "Linux workaround" described below, and add + * the resulting delta to the CLOCK_MONOTONIC "now" value, to get + * the absolute CLOCK_MONOTONIC time at which the timer should + * fire. + */ + if (vcpu->arch.hv_clock.version && vcpu->kvm->arch.use_master_clock && + static_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + uint64_t host_tsc, guest_tsc; + + if (!IS_ENABLED(CONFIG_64BIT) || + !kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) { + /* + * Don't fall back to get_kvmclock_ns() because it's + * broken; it has a systemic error in its results + * because it scales directly from host TSC to + * nanoseconds, and doesn't scale first to guest TSC + * and *then* to nanoseconds as the guest does. + * + * There is a small error introduced here because time + * continues to elapse between the ktime_get() and the + * subsequent rdtsc(). But not the systemic drift due + * to get_kvmclock_ns(). + */ + kernel_now = ktime_get(); /* This is CLOCK_MONOTONIC */ + host_tsc = rdtsc(); + } + + /* Calculate the guest kvmclock as the guest would do it. */ + guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc); + guest_now = __pvclock_read_cycles(&vcpu->arch.hv_clock, + guest_tsc); + } else { + /* + * Without CONSTANT_TSC, get_kvmclock_ns() is the only option. + * + * Also if the guest PV clock hasn't been set up yet, as is + * likely to be the case during migration when the vCPU has + * not been run yet. It would be possible to calculate the + * scaling factors properly in that case but there's not much + * point in doing so. The get_kvmclock_ns() drift accumulates + * over time, so it's OK to use it at startup. Besides, on + * migration there's going to be a little bit of skew in the + * precise moment at which timers fire anyway. Often they'll + * be in the "past" by the time the VM is running again after + * migration. + */ + guest_now = get_kvmclock_ns(vcpu->kvm); + kernel_now = ktime_get(); + } + + delta = guest_abs - guest_now; + + /* + * Xen has a 'Linux workaround' in do_set_timer_op() which checks for + * negative absolute timeout values (caused by integer overflow), and + * for values about 13 days in the future (2^50ns) which would be + * caused by jiffies overflow. For those cases, Xen sets the timeout + * 100ms in the future (not *too* soon, since if a guest really did + * set a long timeout on purpose we don't want to keep churning CPU + * time by waking it up). Emulate Xen's workaround when starting the + * timer in response to __HYPERVISOR_set_timer_op. + */ + if (linux_wa && + unlikely((int64_t)guest_abs < 0 || + (delta > 0 && (uint32_t) (delta >> 50) != 0))) { + delta = 100 * NSEC_PER_MSEC; + guest_abs = guest_now + delta; + } + /* * Avoid races with the old timer firing. Checking timer_expires * to avoid calling hrtimer_cancel() will only have false positives @@ -171,14 +248,12 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ atomic_set(&vcpu->arch.xen.timer_pending, 0); vcpu->arch.xen.timer_expires = guest_abs; - if (delta_ns <= 0) { + if (delta <= 0) xen_timer_callback(&vcpu->arch.xen.timer); - } else { - ktime_t ktime_now = ktime_get(); + else hrtimer_start(&vcpu->arch.xen.timer, - ktime_add_ns(ktime_now, delta_ns), + ktime_add_ns(kernel_now, delta), HRTIMER_MODE_ABS_HARD); - } } static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu) @@ -452,14 +527,13 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) smp_wmb(); } - if (user_len2) + if (user_len2) { + kvm_gpc_mark_dirty_in_slot(gpc2); read_unlock(&gpc2->lock); + } + kvm_gpc_mark_dirty_in_slot(gpc1); read_unlock_irqrestore(&gpc1->lock, flags); - - mark_page_dirty_in_slot(v->kvm, gpc1->memslot, gpc1->gpa >> PAGE_SHIFT); - if (user_len2) - mark_page_dirty_in_slot(v->kvm, gpc2->memslot, gpc2->gpa >> PAGE_SHIFT); } void kvm_xen_update_runstate(struct kvm_vcpu *v, int state) @@ -493,10 +567,9 @@ void kvm_xen_update_runstate(struct kvm_vcpu *v, int state) kvm_xen_update_runstate_guest(v, state == RUNSTATE_runnable); } -static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v) +void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v) { struct kvm_lapic_irq irq = { }; - int r; irq.dest_id = v->vcpu_id; irq.vector = v->arch.xen.upcall_vector; @@ -505,8 +578,7 @@ static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v) irq.delivery_mode = APIC_DM_FIXED; irq.level = 1; - /* The fast version will always work for physical unicast */ - WARN_ON_ONCE(!kvm_irq_delivery_to_apic_fast(v->kvm, NULL, &irq, &r, NULL)); + kvm_irq_delivery_to_apic(v->kvm, NULL, &irq, NULL); } /* @@ -565,13 +637,13 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v) : "0" (evtchn_pending_sel32)); WRITE_ONCE(vi->evtchn_upcall_pending, 1); } + + kvm_gpc_mark_dirty_in_slot(gpc); read_unlock_irqrestore(&gpc->lock, flags); /* For the per-vCPU lapic vector, deliver it as MSI. */ if (v->arch.xen.upcall_vector) kvm_xen_inject_vcpu_vector(v); - - mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT); } int __kvm_xen_has_interrupt(struct kvm_vcpu *v) @@ -635,17 +707,59 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) } else { mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.long_mode = !!data->u.long_mode; + + /* + * Re-initialize shared_info to put the wallclock in the + * correct place. Whilst it's not necessary to do this + * unless the mode is actually changed, it does no harm + * to make the call anyway. + */ + r = kvm->arch.xen.shinfo_cache.active ? + kvm_xen_shared_info_init(kvm) : 0; mutex_unlock(&kvm->arch.xen.xen_lock); - r = 0; } break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: + case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: { + int idx; + mutex_lock(&kvm->arch.xen.xen_lock); - r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn); + + idx = srcu_read_lock(&kvm->srcu); + + if (data->type == KVM_XEN_ATTR_TYPE_SHARED_INFO) { + gfn_t gfn = data->u.shared_info.gfn; + + if (gfn == KVM_XEN_INVALID_GFN) { + kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); + r = 0; + } else { + r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache, + gfn_to_gpa(gfn), PAGE_SIZE); + } + } else { + void __user * hva = u64_to_user_ptr(data->u.shared_info.hva); + + if (!PAGE_ALIGNED(hva) || !access_ok(hva, PAGE_SIZE)) { + r = -EINVAL; + } else if (!hva) { + kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); + r = 0; + } else { + r = kvm_gpc_activate_hva(&kvm->arch.xen.shinfo_cache, + (unsigned long)hva, PAGE_SIZE); + } + } + + srcu_read_unlock(&kvm->srcu, idx); + + if (!r && kvm->arch.xen.shinfo_cache.active) + r = kvm_xen_shared_info_init(kvm); + mutex_unlock(&kvm->arch.xen.xen_lock); break; - + } case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: if (data->u.vector && data->u.vector < 0x10) r = -EINVAL; @@ -699,13 +813,21 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: - if (kvm->arch.xen.shinfo_cache.active) + if (kvm_gpc_is_gpa_active(&kvm->arch.xen.shinfo_cache)) data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa); else data->u.shared_info.gfn = KVM_XEN_INVALID_GFN; r = 0; break; + case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: + if (kvm_gpc_is_hva_active(&kvm->arch.xen.shinfo_cache)) + data->u.shared_info.hva = kvm->arch.xen.shinfo_cache.uhva; + else + data->u.shared_info.hva = 0; + r = 0; + break; + case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: data->u.vector = kvm->arch.xen.upcall_vector; r = 0; @@ -742,20 +864,33 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) switch (data->type) { case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO: + case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA: /* No compat necessary here. */ BUILD_BUG_ON(sizeof(struct vcpu_info) != sizeof(struct compat_vcpu_info)); BUILD_BUG_ON(offsetof(struct vcpu_info, time) != offsetof(struct compat_vcpu_info, time)); - if (data->u.gpa == KVM_XEN_INVALID_GPA) { - kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache); - r = 0; - break; + if (data->type == KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO) { + if (data->u.gpa == KVM_XEN_INVALID_GPA) { + kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache); + r = 0; + break; + } + + r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache, + data->u.gpa, sizeof(struct vcpu_info)); + } else { + if (data->u.hva == 0) { + kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache); + r = 0; + break; + } + + r = kvm_gpc_activate_hva(&vcpu->arch.xen.vcpu_info_cache, + data->u.hva, sizeof(struct vcpu_info)); } - r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache, - data->u.gpa, sizeof(struct vcpu_info)); if (!r) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -944,9 +1079,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) /* Start the timer if the new value has a valid vector+expiry. */ if (data->u.timer.port && data->u.timer.expires_ns) - kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, - data->u.timer.expires_ns - - get_kvmclock_ns(vcpu->kvm)); + kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, false); r = 0; break; @@ -977,13 +1110,21 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) switch (data->type) { case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO: - if (vcpu->arch.xen.vcpu_info_cache.active) + if (kvm_gpc_is_gpa_active(&vcpu->arch.xen.vcpu_info_cache)) data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa; else data->u.gpa = KVM_XEN_INVALID_GPA; r = 0; break; + case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA: + if (kvm_gpc_is_hva_active(&vcpu->arch.xen.vcpu_info_cache)) + data->u.hva = vcpu->arch.xen.vcpu_info_cache.uhva; + else + data->u.hva = 0; + r = 0; + break; + case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO: if (vcpu->arch.xen.vcpu_time_info_cache.active) data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa; @@ -1093,9 +1234,24 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) u32 page_num = data & ~PAGE_MASK; u64 page_addr = data & PAGE_MASK; bool lm = is_long_mode(vcpu); + int r = 0; + + mutex_lock(&kvm->arch.xen.xen_lock); + if (kvm->arch.xen.long_mode != lm) { + kvm->arch.xen.long_mode = lm; + + /* + * Re-initialize shared_info to put the wallclock in the + * correct place. + */ + if (kvm->arch.xen.shinfo_cache.active && + kvm_xen_shared_info_init(kvm)) + r = 1; + } + mutex_unlock(&kvm->arch.xen.xen_lock); - /* Latch long_mode for shared_info pages etc. */ - vcpu->kvm->arch.xen.long_mode = lm; + if (r) + return r; /* * If Xen hypercall intercept is enabled, fill the hypercall @@ -1396,7 +1552,6 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd, { struct vcpu_set_singleshot_timer oneshot; struct x86_exception e; - s64 delta; if |