summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaolo Bonzini <pbonzini@redhat.com>2024-03-11 10:42:55 -0400
committerPaolo Bonzini <pbonzini@redhat.com>2024-03-11 10:42:55 -0400
commite9a2bba476c8332ed547fce485c158d03b0b9659 (patch)
tree243f63d00ab1ea67d492fed6661ad22451cc9df4
parente9025cdd8c5c17e97949423856aced1d6f31c62f (diff)
parent7a36d680658ba5a0d350f2ad275b97156b8d4333 (diff)
downloadlinux-e9a2bba476c8332ed547fce485c158d03b0b9659.tar.gz
linux-e9a2bba476c8332ed547fce485c158d03b0b9659.tar.bz2
linux-e9a2bba476c8332ed547fce485c158d03b0b9659.zip
Merge tag 'kvm-x86-xen-6.9' of https://github.com/kvm-x86/linux into HEAD
KVM Xen and pfncache changes for 6.9: - Rip out the half-baked support for using gfn_to_pfn caches to manage pages that are "mapped" into guests via physical addresses. - Add support for using gfn_to_pfn caches with only a host virtual address, i.e. to bypass the "gfn" stage of the cache. The primary use case is overlay pages, where the guest may change the gfn used to reference the overlay page, but the backing hva+pfn remains the same. - Add an ioctl() to allow mapping Xen's shared_info page using an hva instead of a gpa, so that userspace doesn't need to reconfigure and invalidate the cache/mapping if the guest changes the gpa (but userspace keeps the resolved hva the same). - When possible, use a single host TSC value when computing the deadline for Xen timers in order to improve the accuracy of the timer emulation. - Inject pending upcall events when the vCPU software-enables its APIC to fix a bug where an upcall can be lost (and to follow Xen's behavior). - Fall back to the slow path instead of warning if "fast" IRQ delivery of Xen events fails, e.g. if the guest has aliased xAPIC IDs. - Extend gfn_to_pfn_cache's mutex to cover (de)activation (in addition to refresh), and drop a now-redundant acquisition of xen_lock (that was protecting the shared_info cache) to fix a deadlock due to recursively acquiring xen_lock.
-rw-r--r--Documentation/virt/kvm/api.rst53
-rw-r--r--arch/s390/kvm/diag.c2
-rw-r--r--arch/s390/kvm/gaccess.c14
-rw-r--r--arch/s390/kvm/kvm-s390.c4
-rw-r--r--arch/s390/kvm/priv.c4
-rw-r--r--arch/s390/kvm/sigp.c2
-rw-r--r--arch/x86/include/uapi/asm/kvm.h9
-rw-r--r--arch/x86/kvm/lapic.c5
-rw-r--r--arch/x86/kvm/x86.c68
-rw-r--r--arch/x86/kvm/x86.h1
-rw-r--r--arch/x86/kvm/xen.c315
-rw-r--r--arch/x86/kvm/xen.h18
-rw-r--r--include/linux/kvm_host.h56
-rw-r--r--include/linux/kvm_types.h8
-rw-r--r--tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c59
-rw-r--r--virt/kvm/pfncache.c251
16 files changed, 601 insertions, 268 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 09c7e585ff58..0b5a33ee71ee 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -372,7 +372,7 @@ The bits in the dirty bitmap are cleared before the ioctl returns, unless
KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is enabled. For more information,
see the description of the capability.
-Note that the Xen shared info page, if configured, shall always be assumed
+Note that the Xen shared_info page, if configured, shall always be assumed
to be dirty. KVM will not explicitly mark it such.
@@ -5487,8 +5487,9 @@ KVM_PV_ASYNC_CLEANUP_PERFORM
__u8 long_mode;
__u8 vector;
__u8 runstate_update_flag;
- struct {
+ union {
__u64 gfn;
+ __u64 hva;
} shared_info;
struct {
__u32 send_port;
@@ -5516,19 +5517,20 @@ type values:
KVM_XEN_ATTR_TYPE_LONG_MODE
Sets the ABI mode of the VM to 32-bit or 64-bit (long mode). This
- determines the layout of the shared info pages exposed to the VM.
+ determines the layout of the shared_info page exposed to the VM.
KVM_XEN_ATTR_TYPE_SHARED_INFO
- Sets the guest physical frame number at which the Xen "shared info"
+ Sets the guest physical frame number at which the Xen shared_info
page resides. Note that although Xen places vcpu_info for the first
32 vCPUs in the shared_info page, KVM does not automatically do so
- and instead requires that KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO be used
- explicitly even when the vcpu_info for a given vCPU resides at the
- "default" location in the shared_info page. This is because KVM may
- not be aware of the Xen CPU id which is used as the index into the
- vcpu_info[] array, so may know the correct default location.
-
- Note that the shared info page may be constantly written to by KVM;
+ and instead requires that KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO or
+ KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA be used explicitly even when
+ the vcpu_info for a given vCPU resides at the "default" location
+ in the shared_info page. This is because KVM may not be aware of
+ the Xen CPU id which is used as the index into the vcpu_info[]
+ array, so may know the correct default location.
+
+ Note that the shared_info page may be constantly written to by KVM;
it contains the event channel bitmap used to deliver interrupts to
a Xen guest, amongst other things. It is exempt from dirty tracking
mechanisms — KVM will not explicitly mark the page as dirty each
@@ -5537,9 +5539,21 @@ KVM_XEN_ATTR_TYPE_SHARED_INFO
any vCPU has been running or any event channel interrupts can be
routed to the guest.
- Setting the gfn to KVM_XEN_INVALID_GFN will disable the shared info
+ Setting the gfn to KVM_XEN_INVALID_GFN will disable the shared_info
page.
+KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA
+ If the KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA flag is also set in the
+ Xen capabilities, then this attribute may be used to set the
+ userspace address at which the shared_info page resides, which
+ will always be fixed in the VMM regardless of where it is mapped
+ in guest physical address space. This attribute should be used in
+ preference to KVM_XEN_ATTR_TYPE_SHARED_INFO as it avoids
+ unnecessary invalidation of an internal cache when the page is
+ re-mapped in guest physcial address space.
+
+ Setting the hva to zero will disable the shared_info page.
+
KVM_XEN_ATTR_TYPE_UPCALL_VECTOR
Sets the exception vector used to deliver Xen event channel upcalls.
This is the HVM-wide vector injected directly by the hypervisor
@@ -5636,6 +5650,21 @@ KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO
on dirty logging. Setting the gpa to KVM_XEN_INVALID_GPA will disable
the vcpu_info.
+KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA
+ If the KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA flag is also set in the
+ Xen capabilities, then this attribute may be used to set the
+ userspace address of the vcpu_info for a given vCPU. It should
+ only be used when the vcpu_info resides at the "default" location
+ in the shared_info page. In this case it is safe to assume the
+ userspace address will not change, because the shared_info page is
+ an overlay on guest memory and remains at a fixed host address
+ regardless of where it is mapped in guest physical address space
+ and hence unnecessary invalidation of an internal cache may be
+ avoided if the guest memory layout is modified.
+ If the vcpu_info does not reside at the "default" location then
+ it is not guaranteed to remain at the same host address and
+ hence the aforementioned cache invalidation is required.
+
KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO
Sets the guest physical address of an additional pvclock structure
for a given vCPU. This is typically used for guest vsyscall support.
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 3c65b8258ae6..2a32438e09ce 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -102,7 +102,7 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu)
parm.token_addr & 7 || parm.zarch != 0x8000000000000000ULL)
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
- if (kvm_is_error_gpa(vcpu->kvm, parm.token_addr))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, parm.token_addr))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
vcpu->arch.pfault_token = parm.token_addr;
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 5bfcc50c1a68..415c99649e43 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -664,7 +664,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
case ASCE_TYPE_REGION1: {
union region1_table_entry rfte;
- if (kvm_is_error_gpa(vcpu->kvm, ptr))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr))
return PGM_ADDRESSING;
if (deref_table(vcpu->kvm, ptr, &rfte.val))
return -EFAULT;
@@ -682,7 +682,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
case ASCE_TYPE_REGION2: {
union region2_table_entry rste;
- if (kvm_is_error_gpa(vcpu->kvm, ptr))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr))
return PGM_ADDRESSING;
if (deref_table(vcpu->kvm, ptr, &rste.val))
return -EFAULT;
@@ -700,7 +700,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
case ASCE_TYPE_REGION3: {
union region3_table_entry rtte;
- if (kvm_is_error_gpa(vcpu->kvm, ptr))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr))
return PGM_ADDRESSING;
if (deref_table(vcpu->kvm, ptr, &rtte.val))
return -EFAULT;
@@ -728,7 +728,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
case ASCE_TYPE_SEGMENT: {
union segment_table_entry ste;
- if (kvm_is_error_gpa(vcpu->kvm, ptr))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr))
return PGM_ADDRESSING;
if (deref_table(vcpu->kvm, ptr, &ste.val))
return -EFAULT;
@@ -748,7 +748,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
ptr = ste.fc0.pto * (PAGE_SIZE / 2) + vaddr.px * 8;
}
}
- if (kvm_is_error_gpa(vcpu->kvm, ptr))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr))
return PGM_ADDRESSING;
if (deref_table(vcpu->kvm, ptr, &pte.val))
return -EFAULT;
@@ -770,7 +770,7 @@ absolute_address:
*prot = PROT_TYPE_IEP;
return PGM_PROTECTION;
}
- if (kvm_is_error_gpa(vcpu->kvm, raddr.addr))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, raddr.addr))
return PGM_ADDRESSING;
*gpa = raddr.addr;
return 0;
@@ -957,7 +957,7 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
return rc;
} else {
gpa = kvm_s390_real_to_abs(vcpu, ga);
- if (kvm_is_error_gpa(vcpu->kvm, gpa)) {
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, gpa)) {
rc = PGM_ADDRESSING;
prot = PROT_NONE;
}
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ea63ac769889..3e5a1d7aa81a 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2878,7 +2878,7 @@ static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop)
srcu_idx = srcu_read_lock(&kvm->srcu);
- if (kvm_is_error_gpa(kvm, mop->gaddr)) {
+ if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) {
r = PGM_ADDRESSING;
goto out_unlock;
}
@@ -2940,7 +2940,7 @@ static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *m
srcu_idx = srcu_read_lock(&kvm->srcu);
- if (kvm_is_error_gpa(kvm, mop->gaddr)) {
+ if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) {
r = PGM_ADDRESSING;
goto out_unlock;
}
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index f875a404a0a0..1be19cc9d73c 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -149,7 +149,7 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu)
* first page, since address is 8k aligned and memory pieces are always
* at least 1MB aligned and have at least a size of 1MB.
*/
- if (kvm_is_error_gpa(vcpu->kvm, address))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, address))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
kvm_s390_set_prefix(vcpu, address);
@@ -464,7 +464,7 @@ static int handle_test_block(struct kvm_vcpu *vcpu)
return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
addr = kvm_s390_real_to_abs(vcpu, addr);
- if (kvm_is_error_gpa(vcpu->kvm, addr))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, addr))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
/*
* We don't expect errors on modern systems, and do not care
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index d9696b530064..55c34cb35428 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -172,7 +172,7 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu,
* first page, since address is 8k aligned and memory pieces are always
* at least 1MB aligned and have at least a size of 1MB.
*/
- if (kvm_is_error_gpa(vcpu->kvm, irq.u.prefix.address)) {
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, irq.u.prefix.address)) {
*reg &= 0xffffffff00000000UL;
*reg |= SIGP_STATUS_INVALID_PARAMETER;
return SIGP_CC_STATUS_STORED;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 0ad6bda1fc39..ad29984d5e39 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -549,6 +549,7 @@ struct kvm_x86_mce {
#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5)
#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6)
#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7)
+#define KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA (1 << 8)
struct kvm_xen_hvm_config {
__u32 flags;
@@ -567,9 +568,10 @@ struct kvm_xen_hvm_attr {
__u8 long_mode;
__u8 vector;
__u8 runstate_update_flag;
- struct {
+ union {
__u64 gfn;
#define KVM_XEN_INVALID_GFN ((__u64)-1)
+ __u64 hva;
} shared_info;
struct {
__u32 send_port;
@@ -611,6 +613,8 @@ struct kvm_xen_hvm_attr {
#define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4
/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG */
#define KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG 0x5
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */
+#define KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA 0x6
struct kvm_xen_vcpu_attr {
__u16 type;
@@ -618,6 +622,7 @@ struct kvm_xen_vcpu_attr {
union {
__u64 gpa;
#define KVM_XEN_INVALID_GPA ((__u64)-1)
+ __u64 hva;
__u64 pad[8];
struct {
__u64 state;
@@ -648,6 +653,8 @@ struct kvm_xen_vcpu_attr {
#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID 0x6
#define KVM_XEN_VCPU_ATTR_TYPE_TIMER 0x7
#define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR 0x8
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */
+#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA 0x9
/* Secure Encrypted Virtualization command */
enum sev_cmd_id {
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 681f6d82d015..11b9a9bdc07a 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -41,6 +41,7 @@
#include "ioapic.h"
#include "trace.h"
#include "x86.h"
+#include "xen.h"
#include "cpuid.h"
#include "hyperv.h"
#include "smm.h"
@@ -502,8 +503,10 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
}
/* Check if there are APF page ready requests pending */
- if (enabled)
+ if (enabled) {
kvm_make_request(KVM_REQ_APF_READY, apic->vcpu);
+ kvm_xen_sw_enable_lapic(apic->vcpu);
+ }
}
static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 171ff4d1b7cb..fb2bc9f5fe96 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2854,7 +2854,11 @@ static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
return v * clock->mult;
}
-static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
+/*
+ * As with get_kvmclock_base_ns(), this counts from boot time, at the
+ * frequency of CLOCK_MONOTONIC_RAW (hence adding gtos->offs_boot).
+ */
+static int do_kvmclock_base(s64 *t, u64 *tsc_timestamp)
{
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
unsigned long seq;
@@ -2873,6 +2877,29 @@ static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
return mode;
}
+/*
+ * This calculates CLOCK_MONOTONIC at the time of the TSC snapshot, with
+ * no boot time offset.
+ */
+static int do_monotonic(s64 *t, u64 *tsc_timestamp)
+{
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ unsigned long seq;
+ int mode;
+ u64 ns;
+
+ do {
+ seq = read_seqcount_begin(&gtod->seq);
+ ns = gtod->clock.base_cycles;
+ ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
+ ns >>= gtod->clock.shift;
+ ns += ktime_to_ns(gtod->clock.offset);
+ } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+ *t = ns;
+
+ return mode;
+}
+
static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
{
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
@@ -2894,18 +2921,42 @@ static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
return mode;
}
-/* returns true if host is using TSC based clocksource */
+/*
+ * Calculates the kvmclock_base_ns (CLOCK_MONOTONIC_RAW + boot time) and
+ * reports the TSC value from which it do so. Returns true if host is
+ * using TSC based clocksource.
+ */
static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
{
/* checked again under seqlock below */
if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
return false;
- return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
- tsc_timestamp));
+ return gtod_is_based_on_tsc(do_kvmclock_base(kernel_ns,
+ tsc_timestamp));
}
-/* returns true if host is using TSC based clocksource */
+/*
+ * Calculates CLOCK_MONOTONIC and reports the TSC value from which it did
+ * so. Returns true if host is using TSC based clocksource.
+ */
+bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
+{
+ /* checked again under seqlock below */
+ if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
+ return false;
+
+ return gtod_is_based_on_tsc(do_monotonic(kernel_ns,
+ tsc_timestamp));
+}
+
+/*
+ * Calculates CLOCK_REALTIME and reports the TSC value from which it did
+ * so. Returns true if host is using TSC based clocksource.
+ *
+ * DO NOT USE this for anything related to migration. You want CLOCK_TAI
+ * for that.
+ */
static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
u64 *tsc_timestamp)
{
@@ -3152,7 +3203,7 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
guest_hv_clock->version = ++vcpu->hv_clock.version;
- mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+ kvm_gpc_mark_dirty_in_slot(gpc);
read_unlock_irqrestore(&gpc->lock, flags);
trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
@@ -4674,7 +4725,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
KVM_XEN_HVM_CONFIG_SHARED_INFO |
KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
- KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
+ KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE |
+ KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA;
if (sched_info_on())
r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
@@ -12027,7 +12079,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.regs_avail = ~0;
vcpu->arch.regs_dirty = ~0;
- kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm, vcpu, KVM_HOST_USES_PFN);
+ kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm);
if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 4dc38092d599..a8b71803777b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -294,6 +294,7 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
u64 get_kvmclock_ns(struct kvm *kvm);
uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm);
+bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp);
int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
gva_t addr, void *val, unsigned int bytes,
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 4b4e738c6f1b..f65b35a05d91 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -10,7 +10,7 @@
#include "x86.h"
#include "xen.h"
#include "hyperv.h"
-#include "lapic.h"
+#include "irq.h"
#include <linux/eventfd.h>
#include <linux/kvm_host.h>
@@ -24,6 +24,7 @@
#include <xen/interface/sched.h>
#include <asm/xen/cpuid.h>
+#include <asm/pvclock.h>
#include "cpuid.h"
#include "trace.h"
@@ -34,41 +35,32 @@ static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r);
DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
-static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
+static int kvm_xen_shared_info_init(struct kvm *kvm)
{
struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
struct pvclock_wall_clock *wc;
- gpa_t gpa = gfn_to_gpa(gfn);
u32 *wc_sec_hi;
u32 wc_version;
u64 wall_nsec;
int ret = 0;
int idx = srcu_read_lock(&kvm->srcu);
- if (gfn == KVM_XEN_INVALID_GFN) {
- kvm_gpc_deactivate(gpc);
- goto out;
- }
+ read_lock_irq(&gpc->lock);
+ while (!kvm_gpc_check(gpc, PAGE_SIZE)) {
+ read_unlock_irq(&gpc->lock);
- do {
- ret = kvm_gpc_activate(gpc, gpa, PAGE_SIZE);
+ ret = kvm_gpc_refresh(gpc, PAGE_SIZE);
if (ret)
goto out;
- /*
- * This code mirrors kvm_write_wall_clock() except that it writes
- * directly through the pfn cache and doesn't mark the page dirty.
- */
- wall_nsec = kvm_get_wall_clock_epoch(kvm);
-
- /* It could be invalid again already, so we need to check */
read_lock_irq(&gpc->lock);
+ }
- if (gpc->valid)
- break;
-
- read_unlock_irq(&gpc->lock);
- } while (1);
+ /*
+ * This code mirrors kvm_write_wall_clock() except that it writes
+ * directly through the pfn cache and doesn't mark the page dirty.
+ */
+ wall_nsec = kvm_get_wall_clock_epoch(kvm);
/* Paranoia checks on the 32-bit struct layout */
BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
@@ -158,8 +150,93 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns)
+static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,
+ bool linux_wa)
{
+ int64_t kernel_now, delta;
+ uint64_t guest_now;
+
+ /*
+ * The guest provides the requested timeout in absolute nanoseconds
+ * of the KVM clock — as *it* sees it, based on the scaled TSC and
+ * the pvclock information provided by KVM.
+ *
+ * The kernel doesn't support hrtimers based on CLOCK_MONOTONIC_RAW
+ * so use CLOCK_MONOTONIC. In the timescales covered by timers, the
+ * difference won't matter much as there is no cumulative effect.
+ *
+ * Calculate the time for some arbitrary point in time around "now"
+ * in terms of both kvmclock and CLOCK_MONOTONIC. Calculate the
+ * delta between the kvmclock "now" value and the guest's requested
+ * timeout, apply the "Linux workaround" described below, and add
+ * the resulting delta to the CLOCK_MONOTONIC "now" value, to get
+ * the absolute CLOCK_MONOTONIC time at which the timer should
+ * fire.
+ */
+ if (vcpu->arch.hv_clock.version && vcpu->kvm->arch.use_master_clock &&
+ static_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ uint64_t host_tsc, guest_tsc;
+
+ if (!IS_ENABLED(CONFIG_64BIT) ||
+ !kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) {
+ /*
+ * Don't fall back to get_kvmclock_ns() because it's
+ * broken; it has a systemic error in its results
+ * because it scales directly from host TSC to
+ * nanoseconds, and doesn't scale first to guest TSC
+ * and *then* to nanoseconds as the guest does.
+ *
+ * There is a small error introduced here because time
+ * continues to elapse between the ktime_get() and the
+ * subsequent rdtsc(). But not the systemic drift due
+ * to get_kvmclock_ns().
+ */
+ kernel_now = ktime_get(); /* This is CLOCK_MONOTONIC */
+ host_tsc = rdtsc();
+ }
+
+ /* Calculate the guest kvmclock as the guest would do it. */
+ guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc);
+ guest_now = __pvclock_read_cycles(&vcpu->arch.hv_clock,
+ guest_tsc);
+ } else {
+ /*
+ * Without CONSTANT_TSC, get_kvmclock_ns() is the only option.
+ *
+ * Also if the guest PV clock hasn't been set up yet, as is
+ * likely to be the case during migration when the vCPU has
+ * not been run yet. It would be possible to calculate the
+ * scaling factors properly in that case but there's not much
+ * point in doing so. The get_kvmclock_ns() drift accumulates
+ * over time, so it's OK to use it at startup. Besides, on
+ * migration there's going to be a little bit of skew in the
+ * precise moment at which timers fire anyway. Often they'll
+ * be in the "past" by the time the VM is running again after
+ * migration.
+ */
+ guest_now = get_kvmclock_ns(vcpu->kvm);
+ kernel_now = ktime_get();
+ }
+
+ delta = guest_abs - guest_now;
+
+ /*
+ * Xen has a 'Linux workaround' in do_set_timer_op() which checks for
+ * negative absolute timeout values (caused by integer overflow), and
+ * for values about 13 days in the future (2^50ns) which would be
+ * caused by jiffies overflow. For those cases, Xen sets the timeout
+ * 100ms in the future (not *too* soon, since if a guest really did
+ * set a long timeout on purpose we don't want to keep churning CPU
+ * time by waking it up). Emulate Xen's workaround when starting the
+ * timer in response to __HYPERVISOR_set_timer_op.
+ */
+ if (linux_wa &&
+ unlikely((int64_t)guest_abs < 0 ||
+ (delta > 0 && (uint32_t) (delta >> 50) != 0))) {
+ delta = 100 * NSEC_PER_MSEC;
+ guest_abs = guest_now + delta;
+ }
+
/*
* Avoid races with the old timer firing. Checking timer_expires
* to avoid calling hrtimer_cancel() will only have false positives
@@ -171,14 +248,12 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_
atomic_set(&vcpu->arch.xen.timer_pending, 0);
vcpu->arch.xen.timer_expires = guest_abs;
- if (delta_ns <= 0) {
+ if (delta <= 0)
xen_timer_callback(&vcpu->arch.xen.timer);
- } else {
- ktime_t ktime_now = ktime_get();
+ else
hrtimer_start(&vcpu->arch.xen.timer,
- ktime_add_ns(ktime_now, delta_ns),
+ ktime_add_ns(kernel_now, delta),
HRTIMER_MODE_ABS_HARD);
- }
}
static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu)
@@ -452,14 +527,13 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
smp_wmb();
}
- if (user_len2)
+ if (user_len2) {
+ kvm_gpc_mark_dirty_in_slot(gpc2);
read_unlock(&gpc2->lock);
+ }
+ kvm_gpc_mark_dirty_in_slot(gpc1);
read_unlock_irqrestore(&gpc1->lock, flags);
-
- mark_page_dirty_in_slot(v->kvm, gpc1->memslot, gpc1->gpa >> PAGE_SHIFT);
- if (user_len2)
- mark_page_dirty_in_slot(v->kvm, gpc2->memslot, gpc2->gpa >> PAGE_SHIFT);
}
void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
@@ -493,10 +567,9 @@ void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
kvm_xen_update_runstate_guest(v, state == RUNSTATE_runnable);
}
-static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
+void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
{
struct kvm_lapic_irq irq = { };
- int r;
irq.dest_id = v->vcpu_id;
irq.vector = v->arch.xen.upcall_vector;
@@ -505,8 +578,7 @@ static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
irq.delivery_mode = APIC_DM_FIXED;
irq.level = 1;
- /* The fast version will always work for physical unicast */
- WARN_ON_ONCE(!kvm_irq_delivery_to_apic_fast(v->kvm, NULL, &irq, &r, NULL));
+ kvm_irq_delivery_to_apic(v->kvm, NULL, &irq, NULL);
}
/*
@@ -565,13 +637,13 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
: "0" (evtchn_pending_sel32));
WRITE_ONCE(vi->evtchn_upcall_pending, 1);
}
+
+ kvm_gpc_mark_dirty_in_slot(gpc);
read_unlock_irqrestore(&gpc->lock, flags);
/* For the per-vCPU lapic vector, deliver it as MSI. */
if (v->arch.xen.upcall_vector)
kvm_xen_inject_vcpu_vector(v);
-
- mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
}
int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
@@ -635,17 +707,59 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
} else {
mutex_lock(&kvm->arch.xen.xen_lock);
kvm->arch.xen.long_mode = !!data->u.long_mode;
+
+ /*
+ * Re-initialize shared_info to put the wallclock in the
+ * correct place. Whilst it's not necessary to do this
+ * unless the mode is actually changed, it does no harm
+ * to make the call anyway.
+ */
+ r = kvm->arch.xen.shinfo_cache.active ?
+ kvm_xen_shared_info_init(kvm) : 0;
mutex_unlock(&kvm->arch.xen.xen_lock);
- r = 0;
}
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
+ case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: {
+ int idx;
+
mutex_lock(&kvm->arch.xen.xen_lock);
- r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
+
+ idx = srcu_read_lock(&kvm->srcu);
+
+ if (data->type == KVM_XEN_ATTR_TYPE_SHARED_INFO) {
+ gfn_t gfn = data->u.shared_info.gfn;
+
+ if (gfn == KVM_XEN_INVALID_GFN) {
+ kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
+ r = 0;
+ } else {
+ r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache,
+ gfn_to_gpa(gfn), PAGE_SIZE);
+ }
+ } else {
+ void __user * hva = u64_to_user_ptr(data->u.shared_info.hva);
+
+ if (!PAGE_ALIGNED(hva) || !access_ok(hva, PAGE_SIZE)) {
+ r = -EINVAL;
+ } else if (!hva) {
+ kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
+ r = 0;
+ } else {
+ r = kvm_gpc_activate_hva(&kvm->arch.xen.shinfo_cache,
+ (unsigned long)hva, PAGE_SIZE);
+ }
+ }
+
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ if (!r && kvm->arch.xen.shinfo_cache.active)
+ r = kvm_xen_shared_info_init(kvm);
+
mutex_unlock(&kvm->arch.xen.xen_lock);
break;
-
+ }
case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
if (data->u.vector && data->u.vector < 0x10)
r = -EINVAL;
@@ -699,13 +813,21 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
- if (kvm->arch.xen.shinfo_cache.active)
+ if (kvm_gpc_is_gpa_active(&kvm->arch.xen.shinfo_cache))
data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
else
data->u.shared_info.gfn = KVM_XEN_INVALID_GFN;
r = 0;
break;
+ case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA:
+ if (kvm_gpc_is_hva_active(&kvm->arch.xen.shinfo_cache))
+ data->u.shared_info.hva = kvm->arch.xen.shinfo_cache.uhva;
+ else
+ data->u.shared_info.hva = 0;
+ r = 0;
+ break;
+
case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
data->u.vector = kvm->arch.xen.upcall_vector;
r = 0;
@@ -742,20 +864,33 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
switch (data->type) {
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA:
/* No compat necessary here. */
BUILD_BUG_ON(sizeof(struct vcpu_info) !=
sizeof(struct compat_vcpu_info));
BUILD_BUG_ON(offsetof(struct vcpu_info, time) !=
offsetof(struct compat_vcpu_info, time));
- if (data->u.gpa == KVM_XEN_INVALID_GPA) {
- kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
- r = 0;
- break;
+ if (data->type == KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO) {
+ if (data->u.gpa == KVM_XEN_INVALID_GPA) {
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
+ r = 0;
+ break;
+ }
+
+ r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache,
+ data->u.gpa, sizeof(struct vcpu_info));
+ } else {
+ if (data->u.hva == 0) {
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
+ r = 0;
+ break;
+ }
+
+ r = kvm_gpc_activate_hva(&vcpu->arch.xen.vcpu_info_cache,
+ data->u.hva, sizeof(struct vcpu_info));
}
- r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache,
- data->u.gpa, sizeof(struct vcpu_info));
if (!r)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -944,9 +1079,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
/* Start the timer if the new value has a valid vector+expiry. */
if (data->u.timer.port &&