summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/virt/kvm/api.rst53
-rw-r--r--arch/s390/kvm/diag.c2
-rw-r--r--arch/s390/kvm/gaccess.c14
-rw-r--r--arch/s390/kvm/kvm-s390.c4
-rw-r--r--arch/s390/kvm/priv.c4
-rw-r--r--arch/s390/kvm/sigp.c2
-rw-r--r--arch/x86/include/uapi/asm/kvm.h9
-rw-r--r--arch/x86/kvm/lapic.c5
-rw-r--r--arch/x86/kvm/x86.c68
-rw-r--r--arch/x86/kvm/x86.h1
-rw-r--r--arch/x86/kvm/xen.c315
-rw-r--r--arch/x86/kvm/xen.h18
-rw-r--r--include/linux/kvm_host.h56
-rw-r--r--include/linux/kvm_types.h8
-rw-r--r--tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c59
-rw-r--r--virt/kvm/pfncache.c251
16 files changed, 601 insertions, 268 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 09c7e585ff58..0b5a33ee71ee 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -372,7 +372,7 @@ The bits in the dirty bitmap are cleared before the ioctl returns, unless
KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is enabled. For more information,
see the description of the capability.
-Note that the Xen shared info page, if configured, shall always be assumed
+Note that the Xen shared_info page, if configured, shall always be assumed
to be dirty. KVM will not explicitly mark it such.
@@ -5487,8 +5487,9 @@ KVM_PV_ASYNC_CLEANUP_PERFORM
__u8 long_mode;
__u8 vector;
__u8 runstate_update_flag;
- struct {
+ union {
__u64 gfn;
+ __u64 hva;
} shared_info;
struct {
__u32 send_port;
@@ -5516,19 +5517,20 @@ type values:
KVM_XEN_ATTR_TYPE_LONG_MODE
Sets the ABI mode of the VM to 32-bit or 64-bit (long mode). This
- determines the layout of the shared info pages exposed to the VM.
+ determines the layout of the shared_info page exposed to the VM.
KVM_XEN_ATTR_TYPE_SHARED_INFO
- Sets the guest physical frame number at which the Xen "shared info"
+ Sets the guest physical frame number at which the Xen shared_info
page resides. Note that although Xen places vcpu_info for the first
32 vCPUs in the shared_info page, KVM does not automatically do so
- and instead requires that KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO be used
- explicitly even when the vcpu_info for a given vCPU resides at the
- "default" location in the shared_info page. This is because KVM may
- not be aware of the Xen CPU id which is used as the index into the
- vcpu_info[] array, so may know the correct default location.
-
- Note that the shared info page may be constantly written to by KVM;
+ and instead requires that KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO or
+ KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA be used explicitly even when
+ the vcpu_info for a given vCPU resides at the "default" location
+ in the shared_info page. This is because KVM may not be aware of
+ the Xen CPU id which is used as the index into the vcpu_info[]
+ array, so may know the correct default location.
+
+ Note that the shared_info page may be constantly written to by KVM;
it contains the event channel bitmap used to deliver interrupts to
a Xen guest, amongst other things. It is exempt from dirty tracking
mechanisms — KVM will not explicitly mark the page as dirty each
@@ -5537,9 +5539,21 @@ KVM_XEN_ATTR_TYPE_SHARED_INFO
any vCPU has been running or any event channel interrupts can be
routed to the guest.
- Setting the gfn to KVM_XEN_INVALID_GFN will disable the shared info
+ Setting the gfn to KVM_XEN_INVALID_GFN will disable the shared_info
page.
+KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA
+ If the KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA flag is also set in the
+ Xen capabilities, then this attribute may be used to set the
+ userspace address at which the shared_info page resides, which
+ will always be fixed in the VMM regardless of where it is mapped
+ in guest physical address space. This attribute should be used in
+ preference to KVM_XEN_ATTR_TYPE_SHARED_INFO as it avoids
+ unnecessary invalidation of an internal cache when the page is
+ re-mapped in guest physcial address space.
+
+ Setting the hva to zero will disable the shared_info page.
+
KVM_XEN_ATTR_TYPE_UPCALL_VECTOR
Sets the exception vector used to deliver Xen event channel upcalls.
This is the HVM-wide vector injected directly by the hypervisor
@@ -5636,6 +5650,21 @@ KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO
on dirty logging. Setting the gpa to KVM_XEN_INVALID_GPA will disable
the vcpu_info.
+KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA
+ If the KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA flag is also set in the
+ Xen capabilities, then this attribute may be used to set the
+ userspace address of the vcpu_info for a given vCPU. It should
+ only be used when the vcpu_info resides at the "default" location
+ in the shared_info page. In this case it is safe to assume the
+ userspace address will not change, because the shared_info page is
+ an overlay on guest memory and remains at a fixed host address
+ regardless of where it is mapped in guest physical address space
+ and hence unnecessary invalidation of an internal cache may be
+ avoided if the guest memory layout is modified.
+ If the vcpu_info does not reside at the "default" location then
+ it is not guaranteed to remain at the same host address and
+ hence the aforementioned cache invalidation is required.
+
KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO
Sets the guest physical address of an additional pvclock structure
for a given vCPU. This is typically used for guest vsyscall support.
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 3c65b8258ae6..2a32438e09ce 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -102,7 +102,7 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu)
parm.token_addr & 7 || parm.zarch != 0x8000000000000000ULL)
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
- if (kvm_is_error_gpa(vcpu->kvm, parm.token_addr))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, parm.token_addr))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
vcpu->arch.pfault_token = parm.token_addr;
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 5bfcc50c1a68..415c99649e43 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -664,7 +664,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
case ASCE_TYPE_REGION1: {
union region1_table_entry rfte;
- if (kvm_is_error_gpa(vcpu->kvm, ptr))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr))
return PGM_ADDRESSING;
if (deref_table(vcpu->kvm, ptr, &rfte.val))
return -EFAULT;
@@ -682,7 +682,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
case ASCE_TYPE_REGION2: {
union region2_table_entry rste;
- if (kvm_is_error_gpa(vcpu->kvm, ptr))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr))
return PGM_ADDRESSING;
if (deref_table(vcpu->kvm, ptr, &rste.val))
return -EFAULT;
@@ -700,7 +700,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
case ASCE_TYPE_REGION3: {
union region3_table_entry rtte;
- if (kvm_is_error_gpa(vcpu->kvm, ptr))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr))
return PGM_ADDRESSING;
if (deref_table(vcpu->kvm, ptr, &rtte.val))
return -EFAULT;
@@ -728,7 +728,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
case ASCE_TYPE_SEGMENT: {
union segment_table_entry ste;
- if (kvm_is_error_gpa(vcpu->kvm, ptr))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr))
return PGM_ADDRESSING;
if (deref_table(vcpu->kvm, ptr, &ste.val))
return -EFAULT;
@@ -748,7 +748,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
ptr = ste.fc0.pto * (PAGE_SIZE / 2) + vaddr.px * 8;
}
}
- if (kvm_is_error_gpa(vcpu->kvm, ptr))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr))
return PGM_ADDRESSING;
if (deref_table(vcpu->kvm, ptr, &pte.val))
return -EFAULT;
@@ -770,7 +770,7 @@ absolute_address:
*prot = PROT_TYPE_IEP;
return PGM_PROTECTION;
}
- if (kvm_is_error_gpa(vcpu->kvm, raddr.addr))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, raddr.addr))
return PGM_ADDRESSING;
*gpa = raddr.addr;
return 0;
@@ -957,7 +957,7 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
return rc;
} else {
gpa = kvm_s390_real_to_abs(vcpu, ga);
- if (kvm_is_error_gpa(vcpu->kvm, gpa)) {
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, gpa)) {
rc = PGM_ADDRESSING;
prot = PROT_NONE;
}
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ea63ac769889..3e5a1d7aa81a 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2878,7 +2878,7 @@ static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop)
srcu_idx = srcu_read_lock(&kvm->srcu);
- if (kvm_is_error_gpa(kvm, mop->gaddr)) {
+ if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) {
r = PGM_ADDRESSING;
goto out_unlock;
}
@@ -2940,7 +2940,7 @@ static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *m
srcu_idx = srcu_read_lock(&kvm->srcu);
- if (kvm_is_error_gpa(kvm, mop->gaddr)) {
+ if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) {
r = PGM_ADDRESSING;
goto out_unlock;
}
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index f875a404a0a0..1be19cc9d73c 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -149,7 +149,7 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu)
* first page, since address is 8k aligned and memory pieces are always
* at least 1MB aligned and have at least a size of 1MB.
*/
- if (kvm_is_error_gpa(vcpu->kvm, address))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, address))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
kvm_s390_set_prefix(vcpu, address);
@@ -464,7 +464,7 @@ static int handle_test_block(struct kvm_vcpu *vcpu)
return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
addr = kvm_s390_real_to_abs(vcpu, addr);
- if (kvm_is_error_gpa(vcpu->kvm, addr))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, addr))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
/*
* We don't expect errors on modern systems, and do not care
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index d9696b530064..55c34cb35428 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -172,7 +172,7 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu,
* first page, since address is 8k aligned and memory pieces are always
* at least 1MB aligned and have at least a size of 1MB.
*/
- if (kvm_is_error_gpa(vcpu->kvm, irq.u.prefix.address)) {
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, irq.u.prefix.address)) {
*reg &= 0xffffffff00000000UL;
*reg |= SIGP_STATUS_INVALID_PARAMETER;
return SIGP_CC_STATUS_STORED;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 0ad6bda1fc39..ad29984d5e39 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -549,6 +549,7 @@ struct kvm_x86_mce {
#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5)
#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6)
#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7)
+#define KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA (1 << 8)
struct kvm_xen_hvm_config {
__u32 flags;
@@ -567,9 +568,10 @@ struct kvm_xen_hvm_attr {
__u8 long_mode;
__u8 vector;
__u8 runstate_update_flag;
- struct {
+ union {
__u64 gfn;
#define KVM_XEN_INVALID_GFN ((__u64)-1)
+ __u64 hva;
} shared_info;
struct {
__u32 send_port;
@@ -611,6 +613,8 @@ struct kvm_xen_hvm_attr {
#define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4
/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG */
#define KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG 0x5
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */
+#define KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA 0x6
struct kvm_xen_vcpu_attr {
__u16 type;
@@ -618,6 +622,7 @@ struct kvm_xen_vcpu_attr {
union {
__u64 gpa;
#define KVM_XEN_INVALID_GPA ((__u64)-1)
+ __u64 hva;
__u64 pad[8];
struct {
__u64 state;
@@ -648,6 +653,8 @@ struct kvm_xen_vcpu_attr {
#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID 0x6
#define KVM_XEN_VCPU_ATTR_TYPE_TIMER 0x7
#define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR 0x8
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */
+#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA 0x9
/* Secure Encrypted Virtualization command */
enum sev_cmd_id {
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 681f6d82d015..11b9a9bdc07a 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -41,6 +41,7 @@
#include "ioapic.h"
#include "trace.h"
#include "x86.h"
+#include "xen.h"
#include "cpuid.h"
#include "hyperv.h"
#include "smm.h"
@@ -502,8 +503,10 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
}
/* Check if there are APF page ready requests pending */
- if (enabled)
+ if (enabled) {
kvm_make_request(KVM_REQ_APF_READY, apic->vcpu);
+ kvm_xen_sw_enable_lapic(apic->vcpu);
+ }
}
static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 171ff4d1b7cb..fb2bc9f5fe96 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2854,7 +2854,11 @@ static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
return v * clock->mult;
}
-static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
+/*
+ * As with get_kvmclock_base_ns(), this counts from boot time, at the
+ * frequency of CLOCK_MONOTONIC_RAW (hence adding gtos->offs_boot).
+ */
+static int do_kvmclock_base(s64 *t, u64 *tsc_timestamp)
{
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
unsigned long seq;
@@ -2873,6 +2877,29 @@ static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
return mode;
}
+/*
+ * This calculates CLOCK_MONOTONIC at the time of the TSC snapshot, with
+ * no boot time offset.
+ */
+static int do_monotonic(s64 *t, u64 *tsc_timestamp)
+{
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ unsigned long seq;
+ int mode;
+ u64 ns;
+
+ do {
+ seq = read_seqcount_begin(&gtod->seq);
+ ns = gtod->clock.base_cycles;
+ ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
+ ns >>= gtod->clock.shift;
+ ns += ktime_to_ns(gtod->clock.offset);
+ } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+ *t = ns;
+
+ return mode;
+}
+
static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
{
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
@@ -2894,18 +2921,42 @@ static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
return mode;
}
-/* returns true if host is using TSC based clocksource */
+/*
+ * Calculates the kvmclock_base_ns (CLOCK_MONOTONIC_RAW + boot time) and
+ * reports the TSC value from which it do so. Returns true if host is
+ * using TSC based clocksource.
+ */
static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
{
/* checked again under seqlock below */
if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
return false;
- return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
- tsc_timestamp));
+ return gtod_is_based_on_tsc(do_kvmclock_base(kernel_ns,
+ tsc_timestamp));
}
-/* returns true if host is using TSC based clocksource */
+/*
+ * Calculates CLOCK_MONOTONIC and reports the TSC value from which it did
+ * so. Returns true if host is using TSC based clocksource.
+ */
+bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
+{
+ /* checked again under seqlock below */
+ if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
+ return false;
+
+ return gtod_is_based_on_tsc(do_monotonic(kernel_ns,
+ tsc_timestamp));
+}
+
+/*
+ * Calculates CLOCK_REALTIME and reports the TSC value from which it did
+ * so. Returns true if host is using TSC based clocksource.
+ *
+ * DO NOT USE this for anything related to migration. You want CLOCK_TAI
+ * for that.
+ */
static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
u64 *tsc_timestamp)
{
@@ -3152,7 +3203,7 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
guest_hv_clock->version = ++vcpu->hv_clock.version;
- mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+ kvm_gpc_mark_dirty_in_slot(gpc);
read_unlock_irqrestore(&gpc->lock, flags);
trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
@@ -4674,7 +4725,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
KVM_XEN_HVM_CONFIG_SHARED_INFO |
KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
- KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
+ KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE |
+ KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA;
if (sched_info_on())
r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
@@ -12027,7 +12079,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.regs_avail = ~0;
vcpu->arch.regs_dirty = ~0;
- kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm, vcpu, KVM_HOST_USES_PFN);
+ kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm);
if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 4dc38092d599..a8b71803777b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -294,6 +294,7 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
u64 get_kvmclock_ns(struct kvm *kvm);
uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm);
+bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp);
int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
gva_t addr, void *val, unsigned int bytes,
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 4b4e738c6f1b..f65b35a05d91 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -10,7 +10,7 @@
#include "x86.h"
#include "xen.h"
#include "hyperv.h"
-#include "lapic.h"
+#include "irq.h"
#include <linux/eventfd.h>
#include <linux/kvm_host.h>
@@ -24,6 +24,7 @@
#include <xen/interface/sched.h>
#include <asm/xen/cpuid.h>
+#include <asm/pvclock.h>
#include "cpuid.h"
#include "trace.h"
@@ -34,41 +35,32 @@ static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r);
DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
-static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
+static int kvm_xen_shared_info_init(struct kvm *kvm)
{
struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
struct pvclock_wall_clock *wc;
- gpa_t gpa = gfn_to_gpa(gfn);
u32 *wc_sec_hi;
u32 wc_version;
u64 wall_nsec;
int ret = 0;
int idx = srcu_read_lock(&kvm->srcu);
- if (gfn == KVM_XEN_INVALID_GFN) {
- kvm_gpc_deactivate(gpc);
- goto out;
- }
+ read_lock_irq(&gpc->lock);
+ while (!kvm_gpc_check(gpc, PAGE_SIZE)) {
+ read_unlock_irq(&gpc->lock);
- do {
- ret = kvm_gpc_activate(gpc, gpa, PAGE_SIZE);
+ ret = kvm_gpc_refresh(gpc, PAGE_SIZE);
if (ret)
goto out;
- /*
- * This code mirrors kvm_write_wall_clock() except that it writes
- * directly through the pfn cache and doesn't mark the page dirty.
- */
- wall_nsec = kvm_get_wall_clock_epoch(kvm);
-
- /* It could be invalid again already, so we need to check */
read_lock_irq(&gpc->lock);
+ }
- if (gpc->valid)
- break;
-
- read_unlock_irq(&gpc->lock);
- } while (1);
+ /*
+ * This code mirrors kvm_write_wall_clock() except that it writes
+ * directly through the pfn cache and doesn't mark the page dirty.
+ */
+ wall_nsec = kvm_get_wall_clock_epoch(kvm);
/* Paranoia checks on the 32-bit struct layout */
BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
@@ -158,8 +150,93 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns)
+static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,
+ bool linux_wa)
{
+ int64_t kernel_now, delta;
+ uint64_t guest_now;
+
+ /*
+ * The guest provides the requested timeout in absolute nanoseconds
+ * of the KVM clock — as *it* sees it, based on the scaled TSC and
+ * the pvclock information provided by KVM.
+ *
+ * The kernel doesn't support hrtimers based on CLOCK_MONOTONIC_RAW
+ * so use CLOCK_MONOTONIC. In the timescales covered by timers, the
+ * difference won't matter much as there is no cumulative effect.
+ *
+ * Calculate the time for some arbitrary point in time around "now"
+ * in terms of both kvmclock and CLOCK_MONOTONIC. Calculate the
+ * delta between the kvmclock "now" value and the guest's requested
+ * timeout, apply the "Linux workaround" described below, and add
+ * the resulting delta to the CLOCK_MONOTONIC "now" value, to get
+ * the absolute CLOCK_MONOTONIC time at which the timer should
+ * fire.
+ */
+ if (vcpu->arch.hv_clock.version && vcpu->kvm->arch.use_master_clock &&
+ static_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ uint64_t host_tsc, guest_tsc;
+
+ if (!IS_ENABLED(CONFIG_64BIT) ||
+ !kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) {
+ /*
+ * Don't fall back to get_kvmclock_ns() because it's
+ * broken; it has a systemic error in its results
+ * because it scales directly from host TSC to
+ * nanoseconds, and doesn't scale first to guest TSC
+ * and *then* to nanoseconds as the guest does.
+ *
+ * There is a small error introduced here because time
+ * continues to elapse between the ktime_get() and the
+ * subsequent rdtsc(). But not the systemic drift due
+ * to get_kvmclock_ns().
+ */
+ kernel_now = ktime_get(); /* This is CLOCK_MONOTONIC */
+ host_tsc = rdtsc();
+ }
+
+ /* Calculate the guest kvmclock as the guest would do it. */
+ guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc);
+ guest_now = __pvclock_read_cycles(&vcpu->arch.hv_clock,
+ guest_tsc);
+ } else {
+ /*
+ * Without CONSTANT_TSC, get_kvmclock_ns() is the only option.
+ *
+ * Also if the guest PV clock hasn't been set up yet, as is
+ * likely to be the case during migration when the vCPU has
+ * not been run yet. It would be possible to calculate the
+ * scaling factors properly in that case but there's not much
+ * point in doing so. The get_kvmclock_ns() drift accumulates
+ * over time, so it's OK to use it at startup. Besides, on
+ * migration there's going to be a little bit of skew in the
+ * precise moment at which timers fire anyway. Often they'll
+ * be in the "past" by the time the VM is running again after
+ * migration.
+ */
+ guest_now = get_kvmclock_ns(vcpu->kvm);
+ kernel_now = ktime_get();
+ }
+
+ delta = guest_abs - guest_now;
+
+ /*
+ * Xen has a 'Linux workaround' in do_set_timer_op() which checks for
+ * negative absolute timeout values (caused by integer overflow), and
+ * for values about 13 days in the future (2^50ns) which would be
+ * caused by jiffies overflow. For those cases, Xen sets the timeout
+ * 100ms in the future (not *too* soon, since if a guest really did
+ * set a long timeout on purpose we don't want to keep churning CPU
+ * time by waking it up). Emulate Xen's workaround when starting the
+ * timer in response to __HYPERVISOR_set_timer_op.
+ */
+ if (linux_wa &&
+ unlikely((int64_t)guest_abs < 0 ||
+ (delta > 0 && (uint32_t) (delta >> 50) != 0))) {
+ delta = 100 * NSEC_PER_MSEC;
+ guest_abs = guest_now + delta;
+ }
+
/*
* Avoid races with the old timer firing. Checking timer_expires
* to avoid calling hrtimer_cancel() will only have false positives
@@ -171,14 +248,12 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_
atomic_set(&vcpu->arch.xen.timer_pending, 0);
vcpu->arch.xen.timer_expires = guest_abs;
- if (delta_ns <= 0) {
+ if (delta <= 0)
xen_timer_callback(&vcpu->arch.xen.timer);
- } else {
- ktime_t ktime_now = ktime_get();
+ else
hrtimer_start(&vcpu->arch.xen.timer,
- ktime_add_ns(ktime_now, delta_ns),
+ ktime_add_ns(kernel_now, delta),
HRTIMER_MODE_ABS_HARD);
- }
}
static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu)
@@ -452,14 +527,13 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
smp_wmb();
}
- if (user_len2)
+ if (user_len2) {
+ kvm_gpc_mark_dirty_in_slot(gpc2);
read_unlock(&gpc2->lock);
+ }
+ kvm_gpc_mark_dirty_in_slot(gpc1);
read_unlock_irqrestore(&gpc1->lock, flags);
-
- mark_page_dirty_in_slot(v->kvm, gpc1->memslot, gpc1->gpa >> PAGE_SHIFT);
- if (user_len2)
- mark_page_dirty_in_slot(v->kvm, gpc2->memslot, gpc2->gpa >> PAGE_SHIFT);
}
void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
@@ -493,10 +567,9 @@ void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
kvm_xen_update_runstate_guest(v, state == RUNSTATE_runnable);
}
-static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
+void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
{
struct kvm_lapic_irq irq = { };
- int r;
irq.dest_id = v->vcpu_id;
irq.vector = v->arch.xen.upcall_vector;
@@ -505,8 +578,7 @@ static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
irq.delivery_mode = APIC_DM_FIXED;
irq.level = 1;
- /* The fast version will always work for physical unicast */
- WARN_ON_ONCE(!kvm_irq_delivery_to_apic_fast(v->kvm, NULL, &irq, &r, NULL));
+ kvm_irq_delivery_to_apic(v->kvm, NULL, &irq, NULL);
}
/*
@@ -565,13 +637,13 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
: "0" (evtchn_pending_sel32));
WRITE_ONCE(vi->evtchn_upcall_pending, 1);
}
+
+ kvm_gpc_mark_dirty_in_slot(gpc);
read_unlock_irqrestore(&gpc->lock, flags);
/* For the per-vCPU lapic vector, deliver it as MSI. */
if (v->arch.xen.upcall_vector)
kvm_xen_inject_vcpu_vector(v);
-
- mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
}
int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
@@ -635,17 +707,59 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
} else {
mutex_lock(&kvm->arch.xen.xen_lock);
kvm->arch.xen.long_mode = !!data->u.long_mode;
+
+ /*
+ * Re-initialize shared_info to put the wallclock in the
+ * correct place. Whilst it's not necessary to do this
+ * unless the mode is actually changed, it does no harm
+ * to make the call anyway.
+ */
+ r = kvm->arch.xen.shinfo_cache.active ?
+ kvm_xen_shared_info_init(kvm) : 0;
mutex_unlock(&kvm->arch.xen.xen_lock);
- r = 0;
}
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
+ case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: {
+ int idx;
+
mutex_lock(&kvm->arch.xen.xen_lock);
- r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
+
+ idx = srcu_read_lock(&kvm->srcu);
+
+ if (data->type == KVM_XEN_ATTR_TYPE_SHARED_INFO) {
+ gfn_t gfn = data->u.shared_info.gfn;
+
+ if (gfn == KVM_XEN_INVALID_GFN) {
+ kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
+ r = 0;
+ } else {
+ r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache,
+ gfn_to_gpa(gfn), PAGE_SIZE);
+ }
+ } else {
+ void __user * hva = u64_to_user_ptr(data->u.shared_info.hva);
+
+ if (!PAGE_ALIGNED(hva) || !access_ok(hva, PAGE_SIZE)) {
+ r = -EINVAL;
+ } else if (!hva) {
+ kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
+ r = 0;
+ } else {
+ r = kvm_gpc_activate_hva(&kvm->arch.xen.shinfo_cache,
+ (unsigned long)hva, PAGE_SIZE);
+ }
+ }
+
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ if (!r && kvm->arch.xen.shinfo_cache.active)
+ r = kvm_xen_shared_info_init(kvm);
+
mutex_unlock(&kvm->arch.xen.xen_lock);
break;
-
+ }
case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
if (data->u.vector && data->u.vector < 0x10)
r = -EINVAL;
@@ -699,13 +813,21 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
- if (kvm->arch.xen.shinfo_cache.active)
+ if (kvm_gpc_is_gpa_active(&kvm->arch.xen.shinfo_cache))
data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
else
data->u.shared_info.gfn = KVM_XEN_INVALID_GFN;
r = 0;
break;
+ case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA:
+ if (kvm_gpc_is_hva_active(&kvm->arch.xen.shinfo_cache))
+ data->u.shared_info.hva = kvm->arch.xen.shinfo_cache.uhva;
+ else
+ data->u.shared_info.hva = 0;
+ r = 0;
+ break;
+
case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
data->u.vector = kvm->arch.xen.upcall_vector;
r = 0;
@@ -742,20 +864,33 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
switch (data->type) {
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA:
/* No compat necessary here. */
BUILD_BUG_ON(sizeof(struct vcpu_info) !=
sizeof(struct compat_vcpu_info));
BUILD_BUG_ON(offsetof(struct vcpu_info, time) !=
offsetof(struct compat_vcpu_info, time));
- if (data->u.gpa == KVM_XEN_INVALID_GPA) {
- kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
- r = 0;
- break;
+ if (data->type == KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO) {
+ if (data->u.gpa == KVM_XEN_INVALID_GPA) {
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
+ r = 0;
+ break;
+ }
+
+ r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache,
+ data->u.gpa, sizeof(struct vcpu_info));
+ } else {
+ if (data->u.hva == 0) {
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
+ r = 0;
+ break;
+ }
+
+ r = kvm_gpc_activate_hva(&vcpu->arch.xen.vcpu_info_cache,
+ data->u.hva, sizeof(struct vcpu_info));
}
- r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache,
- data->u.gpa, sizeof(struct vcpu_info));
if (!r)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -944,9 +1079,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
/* Start the timer if the new value has a valid vector+expiry. */
if (data->u.timer.port && data->u.timer.expires_ns)
- kvm_xen_start_timer(vcpu, data->u.timer.expires_ns,
- data->u.timer.expires_ns -
- get_kvmclock_ns(vcpu->kvm));
+ kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, false);
r = 0;
break;
@@ -977,13 +1110,21 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
switch (data->type) {
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
- if (vcpu->arch.xen.vcpu_info_cache.active)
+ if (kvm_gpc_is_gpa_active(&vcpu->arch.xen.vcpu_info_cache))
data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
else
data->u.gpa = KVM_XEN_INVALID_GPA;
r = 0;
break;
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA:
+ if (kvm_gpc_is_hva_active(&vcpu->arch.xen.vcpu_info_cache))
+ data->u.hva = vcpu->arch.xen.vcpu_info_cache.uhva;
+ else
+ data->u.hva = 0;
+ r = 0;
+ break;
+
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
if (vcpu->arch.xen.vcpu_time_info_cache.active)
data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
@@ -1093,9 +1234,24 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
u32 page_num = data & ~PAGE_MASK;
u64 page_addr = data & PAGE_MASK;
bool lm = is_long_mode(vcpu);
+ int r = 0;
+
+ mutex_lock(&kvm->arch.xen.xen_lock);
+ if (kvm->arch.xen.long_mode != lm) {
+ kvm->arch.xen.long_mode = lm;
+
+ /*
+ * Re-initialize shared_info to put the wallclock in the
+ * correct place.
+ */
+ if (kvm->arch.xen.shinfo_cache.active &&
+ kvm_xen_shared_info_init(kvm))
+ r = 1;
+ }
+ mutex_unlock(&kvm->arch.xen.xen_lock);
- /* Latch long_mode for shared_info pages etc. */
- vcpu->kvm->arch.xen.long_mode = lm;
+ if (r)
+ return r;
/*
* If Xen hypercall intercept is enabled, fill the hypercall
@@ -1396,7 +1552,6 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
{
struct vcpu_set_singleshot_timer oneshot;
struct x86_exception e;
- s64 delta;
if