author     Paolo Bonzini <pbonzini@redhat.com>  2022-04-08 12:43:40 -0400
committer  Paolo Bonzini <pbonzini@redhat.com>  2022-04-13 13:37:17 -0400
commit     a4cfff3f0f8c07f1f7873a82bdeb3995807dac8c (patch)
tree       3c69d468d5b4a4db2105ed3d9122cebf6c3236cf
parent     42dcbe7d8bac997eef4c379e61d9121a15ed4e36 (diff)
parent     8d5678a76689acbf91245a3791fe853ab773090f (diff)
Merge branch 'kvm-older-features' into HEAD
Merge branch for features that did not make it into 5.18:

* New ioctls to get/set TSC frequency for a whole VM
* Allow userspace to opt out of hypercall patching

Nested virtualization improvements for AMD:

* Support for "nested nested" optimizations (nested vVMLOAD/VMSAVE,
  nested vGIF)
* Allow AVIC to co-exist with a nested guest running
* Fixes for LBR virtualizations when a nested guest is running,
  and nested LBR virtualization support
* PAUSE filtering for nested hypervisors

Guest support:

* Decoupling of vcpu_is_preempted from PV spinlocks

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-rw-r--r--  Documentation/virt/kvm/api.rst  149
-rw-r--r--  arch/x86/include/asm/kvm-x86-ops.h  1
-rw-r--r--  arch/x86/include/asm/kvm_host.h  34
-rw-r--r--  arch/x86/include/uapi/asm/kvm.h  11
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c  4
-rw-r--r--  arch/x86/kernel/kvm.c  77
-rw-r--r--  arch/x86/kvm/i8259.c  1
-rw-r--r--  arch/x86/kvm/irq.c  10
-rw-r--r--  arch/x86/kvm/irq_comm.c  2
-rw-r--r--  arch/x86/kvm/mmu/mmu.c  18
-rw-r--r--  arch/x86/kvm/svm/avic.c  10
-rw-r--r--  arch/x86/kvm/svm/nested.c  297
-rw-r--r--  arch/x86/kvm/svm/svm.c  207
-rw-r--r--  arch/x86/kvm/svm/svm.h  53
-rw-r--r--  arch/x86/kvm/vmx/vmx.c  2
-rw-r--r--  arch/x86/kvm/x86.c  211
-rw-r--r--  arch/x86/kvm/xen.c  1246
-rw-r--r--  arch/x86/kvm/xen.h  62
-rw-r--r--  include/linux/kvm_host.h  3
-rw-r--r--  include/uapi/linux/kvm.h  48
-rw-r--r--  tools/testing/selftests/kvm/.gitignore  1
-rw-r--r--  tools/testing/selftests/kvm/Makefile  2
-rw-r--r--  tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c  170
-rw-r--r--  tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c  119
-rw-r--r--  tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c  366
25 files changed, 2543 insertions, 561 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 85c7abc51af5..e7a0dfdc0178 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -982,12 +982,22 @@ memory.
__u8 pad2[30];
};
-If the KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL flag is returned from the
-KVM_CAP_XEN_HVM check, it may be set in the flags field of this ioctl.
-This requests KVM to generate the contents of the hypercall page
-automatically; hypercalls will be intercepted and passed to userspace
-through KVM_EXIT_XEN. In this case, all of the blob size and address
-fields must be zero.
+If certain flags are returned from the KVM_CAP_XEN_HVM check, they may
+be set in the flags field of this ioctl:
+
+The KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL flag requests KVM to generate
+the contents of the hypercall page automatically; hypercalls will be
+intercepted and passed to userspace through KVM_EXIT_XEN. In this
+case, all of the blob size and address fields must be zero.
+
+The KVM_XEN_HVM_CONFIG_EVTCHN_SEND flag indicates to KVM that userspace
+will always use the KVM_XEN_HVM_EVTCHN_SEND ioctl to deliver event
+channel interrupts rather than manipulating the guest's shared_info
+structures directly. This, in turn, may allow KVM to enable features
+such as intercepting the SCHEDOP_poll hypercall to accelerate PV
+spinlock operation for the guest. Userspace may still use the ioctl
+to deliver events if it was advertised, even if userspace does not
+send this indication that it will always do so.
No other flags are currently valid in the struct kvm_xen_hvm_config.
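
An illustrative userspace sketch (not taken from the patch; assumes <sys/ioctl.h>
and <linux/kvm.h> with kvm_fd and vm_fd already open, error handling omitted)
that checks the advertised flags and opts into both behaviours::

    /* Query KVM_CAP_XEN_HVM: the return value is the bitmask of
     * KVM_XEN_HVM_CONFIG_* flags described above. */
    int xen_caps = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_XEN_HVM);
    struct kvm_xen_hvm_config cfg = { };

    if (xen_caps & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL)
        cfg.flags |= KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL;  /* blob fields stay zero */
    if (xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND)
        cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;      /* events delivered via ioctl */

    ioctl(vm_fd, KVM_XEN_HVM_CONFIG, &cfg);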
@@ -1887,22 +1897,25 @@ the future.
4.55 KVM_SET_TSC_KHZ
--------------------
-:Capability: KVM_CAP_TSC_CONTROL
+:Capability: KVM_CAP_TSC_CONTROL / KVM_CAP_VM_TSC_CONTROL
:Architectures: x86
-:Type: vcpu ioctl
+:Type: vcpu ioctl / vm ioctl
:Parameters: virtual tsc_khz
:Returns: 0 on success, -1 on error
Specifies the tsc frequency for the virtual machine. The unit of the
frequency is KHz.
+If the KVM_CAP_VM_TSC_CONTROL capability is advertised, this can also
+be used as a vm ioctl to set the initial tsc frequency of subsequently
+created vCPUs.
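
An illustrative sketch (not part of the patch) of the new VM-wide usage, with
the frequency passed as the ioctl argument::

    /* Pin the whole VM to a 2 GHz guest TSC before creating any vCPUs;
     * vCPUs created afterwards inherit this as their initial tsc_khz. */
    if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_VM_TSC_CONTROL) > 0) {
        ioctl(vm_fd, KVM_SET_TSC_KHZ, 2000000);
        long khz = ioctl(vm_fd, KVM_GET_TSC_KHZ, 0);   /* reads back 2000000 */
    }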
4.56 KVM_GET_TSC_KHZ
--------------------
-:Capability: KVM_CAP_GET_TSC_KHZ
+:Capability: KVM_CAP_GET_TSC_KHZ / KVM_CAP_VM_TSC_CONTROL
:Architectures: x86
-:Type: vcpu ioctl
+:Type: vcpu ioctl / vm ioctl
:Parameters: none
:Returns: virtual tsc-khz on success, negative value on error
@@ -5216,7 +5229,25 @@ have deterministic behavior.
struct {
__u64 gfn;
} shared_info;
- __u64 pad[4];
+ struct {
+ __u32 send_port;
+ __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */
+ __u32 flags;
+ union {
+ struct {
+ __u32 port;
+ __u32 vcpu;
+ __u32 priority;
+ } port;
+ struct {
+ __u32 port; /* Zero for eventfd */
+ __s32 fd;
+ } eventfd;
+ __u32 padding[4];
+ } deliver;
+ } evtchn;
+ __u32 xen_version;
+ __u64 pad[8];
} u;
};
@@ -5247,6 +5278,30 @@ KVM_XEN_ATTR_TYPE_SHARED_INFO
KVM_XEN_ATTR_TYPE_UPCALL_VECTOR
Sets the exception vector used to deliver Xen event channel upcalls.
+ This is the HVM-wide vector injected directly by the hypervisor
+ (not through the local APIC), typically configured by a guest via
+ HVM_PARAM_CALLBACK_IRQ.
+
+KVM_XEN_ATTR_TYPE_EVTCHN
+ This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
+ support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It configures
+ an outbound port number for interception of EVTCHNOP_send requests
+ from the guest. A given sending port number may be directed back
+ to a specified vCPU (by APIC ID) / port / priority on the guest,
+ or to trigger events on an eventfd. The vCPU and priority can be
+ changed by setting KVM_XEN_EVTCHN_UPDATE in a subsequent call,
+ but other fields cannot change for a given sending port. A port
+ mapping is removed by using KVM_XEN_EVTCHN_DEASSIGN in the flags
+ field.
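
A sketch of configuring such an interception (illustrative only; the port
numbers are made up, and the struct layout is the kvm_xen_hvm_attr shown
above)::

    /* Intercept guest EVTCHNOP_send on port 5 and redeliver it as an
     * event on port 7 of the vCPU with APIC ID 1. */
    struct kvm_xen_hvm_attr evt = {
        .type = KVM_XEN_ATTR_TYPE_EVTCHN,
        .u.evtchn.send_port = 5,
        .u.evtchn.type = EVTCHNSTAT_ipi,        /* from Xen's event_channel.h */
        .u.evtchn.deliver.port.port = 7,
        .u.evtchn.deliver.port.vcpu = 1,
        .u.evtchn.deliver.port.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
    };

    ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &evt);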
+
+KVM_XEN_ATTR_TYPE_XEN_VERSION
+ This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
+ support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It configures
+ the 32-bit version code returned to the guest when it invokes the
+ XENVER_version call; typically (XEN_MAJOR << 16 | XEN_MINOR). PV
+ Xen guests will often use this as a dummy hypercall to trigger
+ event channel delivery, so responding within the kernel without
+ exiting to userspace is beneficial.
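
For instance (illustrative only), advertising Xen 4.17 to the guest::

    struct kvm_xen_hvm_attr ver = {
        .type = KVM_XEN_ATTR_TYPE_XEN_VERSION,
        .u.xen_version = (4 << 16) | 17,        /* XEN_MAJOR << 16 | XEN_MINOR */
    };

    ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &ver);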
4.127 KVM_XEN_HVM_GET_ATTR
--------------------------
@@ -5258,7 +5313,8 @@ KVM_XEN_ATTR_TYPE_UPCALL_VECTOR
:Returns: 0 on success, < 0 on error
Allows Xen VM attributes to be read. For the structure and types,
-see KVM_XEN_HVM_SET_ATTR above.
+see KVM_XEN_HVM_SET_ATTR above. The KVM_XEN_ATTR_TYPE_EVTCHN
+attribute cannot be read.
4.128 KVM_XEN_VCPU_SET_ATTR
---------------------------
@@ -5285,6 +5341,13 @@ see KVM_XEN_HVM_SET_ATTR above.
__u64 time_blocked;
__u64 time_offline;
} runstate;
+ __u32 vcpu_id;
+ struct {
+ __u32 port;
+ __u32 priority;
+ __u64 expires_ns;
+ } timer;
+ __u8 vector;
} u;
};
@@ -5326,6 +5389,27 @@ KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST
or RUNSTATE_offline) to set the current accounted state as of the
adjusted state_entry_time.
+KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID
+ This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
+ support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It sets the Xen
+ vCPU ID of the given vCPU, to allow timer-related VCPU operations to
+ be intercepted by KVM.
+
+KVM_XEN_VCPU_ATTR_TYPE_TIMER
+ This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
+ support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It sets the
+ event channel port/priority for the VIRQ_TIMER of the vCPU, as well
+ as allowing a pending timer to be saved/restored.
+
+KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR
+ This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
+ support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It sets the
+ per-vCPU local APIC upcall vector, configured by a Xen guest with
+ the HVMOP_set_evtchn_upcall_vector hypercall. This is typically
+ used by Windows guests, and is distinct from the HVM-wide upcall
+ vector configured with HVM_PARAM_CALLBACK_IRQ.
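
A combined sketch for these vCPU attributes (illustrative values, not taken
from the patch; vcpu_fd is an open vCPU file descriptor)::

    /* Declare this as Xen vCPU 0 and bind its VIRQ_TIMER to event
     * channel port 3 so timer hypercalls can be handled in-kernel. */
    struct kvm_xen_vcpu_attr va = {
        .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID,
        .u.vcpu_id = 0,
    };
    ioctl(vcpu_fd, KVM_XEN_VCPU_SET_ATTR, &va);

    va.type = KVM_XEN_VCPU_ATTR_TYPE_TIMER;
    va.u.timer.port = 3;
    va.u.timer.priority = 0;
    va.u.timer.expires_ns = 0;      /* no pending timer to restore */
    ioctl(vcpu_fd, KVM_XEN_VCPU_SET_ATTR, &va);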
+
+
4.129 KVM_XEN_VCPU_GET_ATTR
---------------------------
@@ -5645,6 +5729,25 @@ enabled with ``arch_prctl()``, but this may change in the future.
The offsets of the state save areas in struct kvm_xsave follow the contents
of CPUID leaf 0xD on the host.
+4.135 KVM_XEN_HVM_EVTCHN_SEND
+-----------------------------
+
+:Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND
+:Architectures: x86
+:Type: vm ioctl
+:Parameters: struct kvm_irq_routing_xen_evtchn
+:Returns: 0 on success, < 0 on error
+
+
+::
+
+ struct kvm_irq_routing_xen_evtchn {
+ __u32 port;
+ __u32 vcpu;
+ __u32 priority;
+ };
+
+This ioctl injects an event channel interrupt directly to the guest vCPU.
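
An illustrative sketch (not part of the patch) that fires event channel
port 7 on vCPU 1::

    struct kvm_irq_routing_xen_evtchn e = {
        .port = 7,
        .vcpu = 1,
        .priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
    };

    if (ioctl(vm_fd, KVM_XEN_HVM_EVTCHN_SEND, &e) < 0)
        perror("KVM_XEN_HVM_EVTCHN_SEND");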
5. The kvm_run structure
========================
@@ -7135,6 +7238,15 @@ The valid bits in cap.args[0] are:
Additionally, when this quirk is disabled,
KVM clears CPUID.01H:ECX[bit 3] if
IA32_MISC_ENABLE[bit 18] is cleared.
+
+ KVM_X86_QUIRK_FIX_HYPERCALL_INSN By default, KVM rewrites guest
+ VMMCALL/VMCALL instructions to match the
+ vendor's hypercall instruction for the
+ system. When this quirk is disabled, KVM
+ will no longer rewrite invalid guest
+ hypercall instructions. Executing the
+ incorrect hypercall instruction will
+ generate a #UD within the guest.
=================================== ============================================
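
A sketch of disabling the new quirk from userspace (illustrative; assumes the
usual KVM_ENABLE_CAP / KVM_CAP_DISABLE_QUIRKS2 mechanism for this cap.args[0]
bitmask)::

    /* Opt out of hypercall-instruction rewriting: a guest executing the
     * wrong vendor's instruction will then take a #UD. */
    struct kvm_enable_cap cap = {
        .cap = KVM_CAP_DISABLE_QUIRKS2,
        .args[0] = KVM_X86_QUIRK_FIX_HYPERCALL_INSN,
    };

    ioctl(vm_fd, KVM_ENABLE_CAP, &cap);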
8. Other capabilities.
@@ -7612,8 +7724,9 @@ PVHVM guests. Valid flags are::
#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0)
#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1)
#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2)
- #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 2)
- #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 3)
+ #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3)
+ #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4)
+ #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5)
The KVM_XEN_HVM_CONFIG_HYPERCALL_MSR flag indicates that the KVM_XEN_HVM_CONFIG
ioctl is available, for the guest to set its hypercall page.
@@ -7637,6 +7750,14 @@ The KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL flag indicates that IRQ routing entries
of the type KVM_IRQ_ROUTING_XEN_EVTCHN are supported, with the priority
field set to indicate 2 level event channel delivery.
+The KVM_XEN_HVM_CONFIG_EVTCHN_SEND flag indicates that KVM supports
+injecting event channel events directly into the guest with the
+KVM_XEN_HVM_EVTCHN_SEND ioctl. It also indicates support for the
+KVM_XEN_ATTR_TYPE_EVTCHN/XEN_VERSION HVM attributes and the
+KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID/TIMER/UPCALL_VECTOR vCPU attributes
+related to event channel delivery, timers, and the XENVER_version
+interception.
+
8.31 KVM_CAP_PPC_MULTITCE
-------------------------
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 3c368b639c04..96e4e9842dfc 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -126,6 +126,7 @@ KVM_X86_OP_OPTIONAL(migrate_timers)
KVM_X86_OP(msr_filter_changed)
KVM_X86_OP(complete_emulated_msr)
KVM_X86_OP(vcpu_deliver_sipi_vector)
+KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
#undef KVM_X86_OP
#undef KVM_X86_OP_OPTIONAL
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e0c0f0e1f754..f1dfa066068d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -607,16 +607,21 @@ struct kvm_vcpu_hv {
struct kvm_vcpu_xen {
u64 hypercall_rip;
u32 current_runstate;
- bool vcpu_info_set;
- bool vcpu_time_info_set;
- bool runstate_set;
- struct gfn_to_hva_cache vcpu_info_cache;
- struct gfn_to_hva_cache vcpu_time_info_cache;
- struct gfn_to_hva_cache runstate_cache;
+ u8 upcall_vector;
+ struct gfn_to_pfn_cache vcpu_info_cache;
+ struct gfn_to_pfn_cache vcpu_time_info_cache;
+ struct gfn_to_pfn_cache runstate_cache;
u64 last_steal;
u64 runstate_entry_time;
u64 runstate_times[4];
unsigned long evtchn_pending_sel;
+ u32 vcpu_id; /* The Xen / ACPI vCPU ID */
+ u32 timer_virq;
+ u64 timer_expires; /* In guest epoch */
+ atomic_t timer_pending;
+ struct hrtimer timer;
+ int poll_evtchn;
+ struct timer_list poll_timer;
};
struct kvm_vcpu_arch {
@@ -753,8 +758,7 @@ struct kvm_vcpu_arch {
gpa_t time;
struct pvclock_vcpu_time_info hv_clock;
unsigned int hw_tsc_khz;
- struct gfn_to_hva_cache pv_time;
- bool pv_time_enabled;
+ struct gfn_to_pfn_cache pv_time;
/* set guest stopped flag in pvclock flags field */
bool pvclock_set_guest_stopped_request;
@@ -1024,9 +1028,12 @@ struct msr_bitmap_range {
/* Xen emulation context */
struct kvm_xen {
+ u32 xen_version;
bool long_mode;
u8 upcall_vector;
struct gfn_to_pfn_cache shinfo_cache;
+ struct idr evtchn_ports;
+ unsigned long poll_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)];
};
enum kvm_irqchip_mode {
@@ -1119,6 +1126,8 @@ struct kvm_arch {
u64 cur_tsc_generation;
int nr_vcpus_matched_tsc;
+ u32 default_tsc_khz;
+
seqcount_raw_spinlock_t pvclock_sc;
bool use_master_clock;
u64 master_kernel_ns;
@@ -1498,6 +1507,11 @@ struct kvm_x86_ops {
int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err);
void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);
+
+ /*
+ * Returns vCPU specific APICv inhibit reasons
+ */
+ unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu);
};
struct kvm_x86_nested_ops {
@@ -1799,6 +1813,7 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception);
bool kvm_apicv_activated(struct kvm *kvm);
+bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu);
void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
enum kvm_apicv_inhibit reason, bool set);
@@ -1988,6 +2003,7 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
KVM_X86_QUIRK_CD_NW_CLEARED | \
KVM_X86_QUIRK_LAPIC_MMIO_HOLE | \
KVM_X86_QUIRK_OUT_7E_INC_RIP | \
- KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)
+ KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \
+ KVM_X86_QUIRK_FIX_HYPERCALL_INSN)
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index bf6e96011dfe..21614807a2cb 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -428,11 +428,12 @@ struct kvm_sync_regs {
struct kvm_vcpu_events events;
};
-#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0)
-#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1)
-#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2)
-#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3)
-#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4)
+#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0)
+#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1)
+#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2)
+#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3)
+#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4)
+#define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5)
#define KVM_STATE_NESTED_FORMAT_VMX 0
#define KVM_STATE_NESTED_FORMAT_SVM 1
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index b14533af7676..9b698215d261 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -5,7 +5,7 @@
#include <asm/ia32.h>
-#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#if defined(CONFIG_KVM_GUEST)
#include <asm/kvm_para.h>
#endif
@@ -20,7 +20,7 @@ int main(void)
BLANK();
#endif
-#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#if defined(CONFIG_KVM_GUEST)
OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted);
BLANK();
#endif
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a22deb58f86d..d0bb2b3fb305 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -752,6 +752,42 @@ static void kvm_crash_shutdown(struct pt_regs *regs)
}
#endif
+#if defined(CONFIG_X86_32) || !defined(CONFIG_SMP)
+bool __kvm_vcpu_is_preempted(long cpu);
+
+__visible bool __kvm_vcpu_is_preempted(long cpu)
+{
+ struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
+
+ return !!(src->preempted & KVM_VCPU_PREEMPTED);
+}
+PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
+
+#else
+
+#include <asm/asm-offsets.h>
+
+extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
+
+/*
+ * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
+ * restoring to/from the stack.
+ */
+asm(
+".pushsection .text;"
+".global __raw_callee_save___kvm_vcpu_is_preempted;"
+".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
+"__raw_callee_save___kvm_vcpu_is_preempted:"
+ASM_ENDBR
+"movq __per_cpu_offset(,%rdi,8), %rax;"
+"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
+"setne %al;"
+ASM_RET
+".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
+".popsection");
+
+#endif
+
static void __init kvm_guest_init(void)
{
int i;
@@ -764,6 +800,9 @@ static void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
has_steal_clock = 1;
static_call_update(pv_steal_clock, kvm_steal_clock);
+
+ pv_ops.lock.vcpu_is_preempted =
+ PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
}
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
@@ -1005,40 +1044,6 @@ static void kvm_wait(u8 *ptr, u8 val)
}
}
-#ifdef CONFIG_X86_32
-__visible bool __kvm_vcpu_is_preempted(long cpu)
-{
- struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
-
- return !!(src->preempted & KVM_VCPU_PREEMPTED);
-}
-PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
-
-#else
-
-#include <asm/asm-offsets.h>
-
-extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
-
-/*
- * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
- * restoring to/from the stack.
- */
-asm(
-".pushsection .text;"
-".global __raw_callee_save___kvm_vcpu_is_preempted;"
-".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
-"__raw_callee_save___kvm_vcpu_is_preempted:"
-ASM_ENDBR
-"movq __per_cpu_offset(,%rdi,8), %rax;"
-"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
-"setne %al;"
-ASM_RET
-".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
-".popsection");
-
-#endif
-
/*
* Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
*/
@@ -1082,10 +1087,6 @@ void __init kvm_spinlock_init(void)
pv_ops.lock.wait = kvm_wait;
pv_ops.lock.kick = kvm_kick_cpu;
- if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
- pv_ops.lock.vcpu_is_preempted =
- PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
- }
/*
* When PV spinlock is enabled which is preferred over
* virt_spin_lock(), virt_spin_lock_key's value is meaningless.
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index be99dc86293d..e1bb6218bb96 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -252,7 +252,6 @@ int kvm_pic_read_irq(struct kvm *kvm)
*/
irq2 = 7;
intno = s->pics[1].irq_base + irq2;
- irq = irq2 + 8;
} else
intno = s->pics[0].irq_base + irq;
} else {
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 172b05343cfd..f371f1292ca3 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -22,10 +22,14 @@
*/
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
{
+ int r = 0;
+
if (lapic_in_kernel(vcpu))
- return apic_has_pending_timer(vcpu);
+ r = apic_has_pending_timer(vcpu);
+ if (kvm_xen_timer_enabled(vcpu))
+ r += kvm_xen_has_pending_timer(vcpu);
- return 0;
+ return r;
}
EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
@@ -143,6 +147,8 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
{
if (lapic_in_kernel(vcpu))
kvm_inject_apic_timer_irqs(vcpu);
+ if (kvm_xen_timer_enabled(vcpu))
+ kvm_xen_inject_timer_irqs(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 6e0dab04320e..0687162c4f22 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -181,7 +181,7 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
if (!level)
return -1;
- return kvm_xen_set_evtchn_fast(e, kvm);
+ return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm);
#endif
default:
break;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index f9080ee50ffa..c623019929a7 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1866,17 +1866,14 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
-static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
int ret = vcpu->arch.mmu->sync_page(vcpu, sp);
- if (ret < 0) {
+ if (ret < 0)
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
- return false;
- }
-
- return !!ret;
+ return ret;
}
static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
@@ -1998,7 +1995,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu,
for_each_sp(pages, sp, parents, i) {
kvm_unlink_unsync_page(vcpu->kvm, sp);
- flush |= kvm_sync_page(vcpu, sp, &invalid_list);
+ flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0;
mmu_pages_clear_parents(&parents);
}
if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
@@ -2039,6 +2036,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
struct hlist_head *sp_list;
unsigned quadrant;
struct kvm_mmu_page *sp;
+ int ret;
int collisions = 0;
LIST_HEAD(invalid_list);
@@ -2091,11 +2089,13 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
* If the sync fails, the page is zapped. If so, break
* in order to rebuild it.
*/
- if (!kvm_sync_page(vcpu, sp, &invalid_list))
+ ret = kvm_sync_page(vcpu, sp, &invalid_list);
+ if (ret < 0)
break;
WARN_ON(!list_empty(&invalid_list));
- kvm_flush_remote_tlbs(vcpu->kvm);
+ if (ret > 0)
+ kvm_flush_remote_tlbs(vcpu->kvm);
}
__clear_sp_write_flooding_count(sp);
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 421619540ff9..9b859218af59 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -165,9 +165,8 @@ free_avic:
return err;
}
-void avic_init_vmcb(struct vcpu_svm *svm)
+void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
{
- struct vmcb *vmcb = svm->vmcb;
struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
@@ -357,6 +356,13 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
return 1;
}
+unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu))
+ return APICV_INHIBIT_REASON_NESTED;
+ return 0;
+}
+
static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 96bab464967f..73b545278f5f 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -36,40 +36,43 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
struct x86_exception *fault)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
- if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
+ if (vmcb->control.exit_code != SVM_EXIT_NPF) {
/*
* TODO: track the cause of the nested page fault, and
* correctly fill in the high bits of exit_info_1.
*/
- svm->vmcb->control.exit_code = SVM_EXIT_NPF;
- svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = (1ULL << 32);
- svm->vmcb->control.exit_info_2 = fault->address;
+ vmcb->control.exit_code = SVM_EXIT_NPF;
+ vmcb->control.exit_code_hi = 0;
+ vmcb->control.exit_info_1 = (1ULL << 32);
+ vmcb->control.exit_info_2 = fault->address;
}
- svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
- svm->vmcb->control.exit_info_1 |= fault->error_code;
+ vmcb->control.exit_info_1 &= ~0xffffffffULL;
+ vmcb->control.exit_info_1 |= fault->error_code;
nested_svm_vmexit(svm);
}
static void svm_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
- struct vcpu_svm *svm = to_svm(vcpu);
- WARN_ON(!is_guest_mode(vcpu));
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+
+ WARN_ON(!is_guest_mode(vcpu));
if (vmcb12_is_intercept(&svm->nested.ctl,
INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) &&
- !svm->nested.nested_run_pending) {
- svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR;
- svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = fault->error_code;
- svm->vmcb->control.exit_info_2 = fault->address;
- nested_svm_vmexit(svm);
- } else {
- kvm_inject_page_fault(vcpu, fault);
- }
+ !svm->nested.nested_run_pending) {
+ vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR;
+ vmcb->control.exit_code_hi = 0;
+ vmcb->control.exit_info_1 = fault->error_code;
+ vmcb->control.exit_info_2 = fault->address;
+ nested_svm_vmexit(svm);
+ } else {
+ kvm_inject_page_fault(vcpu, fault);
+ }
}
static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
@@ -121,6 +124,20 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}
+static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
+{
+ if (!svm->v_vmload_vmsave_enabled)
+ return true;
+
+ if (!nested_npt_enabled(svm))
+ return true;
+
+ if (!(svm->nested.ctl.virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK))
+ return true;
+
+ return false;
+}
+
void recalc_intercepts(struct vcpu_svm *svm)
{
struct vmcb_control_area *c, *h;
@@ -162,8 +179,17 @@ void recalc_intercepts(struct vcpu_svm *svm)
if (!intercept_smi)
vmcb_clr_intercept(c, INTERCEPT_SMI);
- vmcb_set_intercept(c, INTERCEPT_VMLOAD);
- vmcb_set_intercept(c, INTERCEPT_VMSAVE);
+ if (nested_vmcb_needs_vls_intercept(svm)) {
+ /*
+ * If the virtual VMLOAD/VMSAVE is not enabled for the L2,
+ * we must intercept these instructions to correctly
+ * emulate them in case L1 doesn't intercept them.
+ */
+ vmcb_set_intercept(c, INTERCEPT_VMLOAD);
+ vmcb_set_intercept(c, INTERCEPT_VMSAVE);
+ } else {
+ WARN_ON(!(c->virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK));
+ }
}
/*
@@ -413,6 +439,10 @@ void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
*/
mask &= ~V_IRQ_MASK;
}
+
+ if (nested_vgif_enabled(svm))
+ mask |= V_GIF_MASK;
+
svm->nested.ctl.int_ctl &= ~mask;
svm->nested.ctl.int_ctl |= svm->vmcb->control.int_ctl & mask;
}
@@ -454,11 +484,6 @@ static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
vmcb12->control.exit_int_info = exit_int_info;
}
-static inline bool nested_npt_enabled(struct vcpu_svm *svm)
-{
- return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
-}
-
static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu)
{
/*
@@ -515,6 +540,8 @@ void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
{
bool new_vmcb12 = false;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
nested_vmcb02_compute_g_pat(svm);
@@ -526,18 +553,18 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
}
if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
- svm->vmcb->save.es = vmcb12->save.es;
- svm->vmcb->save.cs = vmcb12->save.cs;
- svm->vmcb->save.ss = vmcb12->save.ss;
- svm->vmcb->save.ds = vmcb12->save.ds;
- svm->vmcb->save.cpl = vmcb12->save.cpl;
- vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
+ vmcb02->save.es = vmcb12->save.es;
+ vmcb02->save.cs = vmcb12->save.cs;
+ vmcb02->save.ss = vmcb12->save.ss;
+ vmcb02->save.ds = vmcb12->save.ds;
+ vmcb02->save.cpl = vmcb12->save.cpl;
+ vmcb_mark_dirty(vmcb02, VMCB_SEG);
}
if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) {
- svm->vmcb->sav