summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaolo Bonzini <pbonzini@redhat.com>2023-08-31 13:36:33 -0400
committerPaolo Bonzini <pbonzini@redhat.com>2023-08-31 13:36:33 -0400
commit6d5e3c318a33edb1f9176964c4ed7f076fc4248c (patch)
tree0af8c82dd29310f3d8b8bceb9107227b3b03495f
parentbd7fe98b353b7e52b3db239d86cbe04234097a20 (diff)
parent9717efbe5ba3f52d4b3cc637c7f7f6149ea264bb (diff)
downloadlinux-6d5e3c318a33edb1f9176964c4ed7f076fc4248c.tar.gz
linux-6d5e3c318a33edb1f9176964c4ed7f076fc4248c.tar.bz2
linux-6d5e3c318a33edb1f9176964c4ed7f076fc4248c.zip
Merge tag 'kvm-x86-misc-6.6' of https://github.com/kvm-x86/linux into HEAD
KVM x86 changes for 6.6: - Misc cleanups - Retry APIC optimized recalculation if a vCPU is added/enabled - Overhaul emergency reboot code to bring SVM up to par with VMX, tie the "emergency disabling" behavior to KVM actually being loaded, and move all of the logic within KVM - Fix user triggerable WARNs in SVM where KVM incorrectly assumes the TSC ratio MSR can diverge from the default iff TSC scaling is enabled, and clean up related code - Add a framework to allow "caching" feature flags so that KVM can check if the guest can use a feature without needing to search guest CPUID
-rw-r--r--arch/x86/include/asm/kexec.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h24
-rw-r--r--arch/x86/include/asm/reboot.h7
-rw-r--r--arch/x86/include/asm/virtext.h154
-rw-r--r--arch/x86/include/asm/vmx.h2
-rw-r--r--arch/x86/kernel/crash.c31
-rw-r--r--arch/x86/kernel/reboot.c66
-rw-r--r--arch/x86/kvm/Kconfig2
-rw-r--r--arch/x86/kvm/cpuid.c40
-rw-r--r--arch/x86/kvm/cpuid.h46
-rw-r--r--arch/x86/kvm/emulate.c2
-rw-r--r--arch/x86/kvm/governed_features.h21
-rw-r--r--arch/x86/kvm/hyperv.c1
-rw-r--r--arch/x86/kvm/kvm_emulate.h1
-rw-r--r--arch/x86/kvm/lapic.c29
-rw-r--r--arch/x86/kvm/mmu/mmu.c22
-rw-r--r--arch/x86/kvm/reverse_cpuid.h1
-rw-r--r--arch/x86/kvm/svm/nested.c57
-rw-r--r--arch/x86/kvm/svm/svm.c150
-rw-r--r--arch/x86/kvm/svm/svm.h18
-rw-r--r--arch/x86/kvm/vmx/capabilities.h2
-rw-r--r--arch/x86/kvm/vmx/hyperv.c2
-rw-r--r--arch/x86/kvm/vmx/nested.c13
-rw-r--r--arch/x86/kvm/vmx/nested.h2
-rw-r--r--arch/x86/kvm/vmx/vmx.c190
-rw-r--r--arch/x86/kvm/vmx/vmx.h3
-rw-r--r--arch/x86/kvm/x86.c46
-rw-r--r--arch/x86/kvm/x86.h1
28 files changed, 490 insertions, 445 deletions
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 5b77bbc28f96..819046974b99 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -205,8 +205,6 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image);
#endif
#endif
-typedef void crash_vmclear_fn(void);
-extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
extern void kdump_nmi_shootdown_cpus(void);
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6523f5494cb2..e3c9ff4146fc 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -746,7 +746,6 @@ struct kvm_vcpu_arch {
u64 smi_count;
bool at_instruction_boundary;
bool tpr_access_reporting;
- bool xsaves_enabled;
bool xfd_no_write_intercept;
u64 ia32_xss;
u64 microcode_version;
@@ -831,6 +830,25 @@ struct kvm_vcpu_arch {
struct kvm_cpuid_entry2 *cpuid_entries;
struct kvm_hypervisor_cpuid kvm_cpuid;
+ /*
+ * FIXME: Drop this macro and use KVM_NR_GOVERNED_FEATURES directly
+ * when "struct kvm_vcpu_arch" is no longer defined in an
+ * arch/x86/include/asm header. The max is mostly arbitrary, i.e.
+ * can be increased as necessary.
+ */
+#define KVM_MAX_NR_GOVERNED_FEATURES BITS_PER_LONG
+
+ /*
+ * Track whether or not the guest is allowed to use features that are
+ * governed by KVM, where "governed" means KVM needs to manage state
+ * and/or explicitly enable the feature in hardware. Typically, but
+ * not always, governed features can be used by the guest if and only
+ * if both KVM and userspace want to expose the feature to the guest.
+ */
+ struct {
+ DECLARE_BITMAP(enabled, KVM_MAX_NR_GOVERNED_FEATURES);
+ } governed_features;
+
u64 reserved_gpa_bits;
int maxphyaddr;
@@ -1655,8 +1673,8 @@ struct kvm_x86_ops {
u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu);
- void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
- void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier);
+ void (*write_tsc_offset)(struct kvm_vcpu *vcpu);
+ void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu);
/*
* Retrieve somewhat arbitrary exit information. Intended to
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 9177b4354c3f..6536873f8fc0 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -25,7 +25,14 @@ void __noreturn machine_real_restart(unsigned int type);
#define MRR_BIOS 0
#define MRR_APM 1
+#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD)
+typedef void (cpu_emergency_virt_cb)(void);
+void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback);
+void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback);
void cpu_emergency_disable_virtualization(void);
+#else
+static inline void cpu_emergency_disable_virtualization(void) {}
+#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */
typedef void (*nmi_shootdown_cb)(int, struct pt_regs*);
void nmi_shootdown_cpus(nmi_shootdown_cb callback);
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
deleted file mode 100644
index 3b12e6b99412..000000000000
--- a/arch/x86/include/asm/virtext.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/* CPU virtualization extensions handling
- *
- * This should carry the code for handling CPU virtualization extensions
- * that needs to live in the kernel core.
- *
- * Author: Eduardo Habkost <ehabkost@redhat.com>
- *
- * Copyright (C) 2008, Red Hat Inc.
- *
- * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc.
- */
-#ifndef _ASM_X86_VIRTEX_H
-#define _ASM_X86_VIRTEX_H
-
-#include <asm/processor.h>
-
-#include <asm/vmx.h>
-#include <asm/svm.h>
-#include <asm/tlbflush.h>
-
-/*
- * VMX functions:
- */
-
-static inline int cpu_has_vmx(void)
-{
- unsigned long ecx = cpuid_ecx(1);
- return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
-}
-
-
-/**
- * cpu_vmxoff() - Disable VMX on the current CPU
- *
- * Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
- *
- * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
- * atomically track post-VMXON state, e.g. this may be called in NMI context.
- * Eat all faults as all other faults on VMXOFF faults are mode related, i.e.
- * faults are guaranteed to be due to the !post-VMXON check unless the CPU is
- * magically in RM, VM86, compat mode, or at CPL>0.
- */
-static inline int cpu_vmxoff(void)
-{
- asm_volatile_goto("1: vmxoff\n\t"
- _ASM_EXTABLE(1b, %l[fault])
- ::: "cc", "memory" : fault);
-
- cr4_clear_bits(X86_CR4_VMXE);
- return 0;
-
-fault:
- cr4_clear_bits(X86_CR4_VMXE);
- return -EIO;
-}
-
-static inline int cpu_vmx_enabled(void)
-{
- return __read_cr4() & X86_CR4_VMXE;
-}
-
-/** Disable VMX if it is enabled on the current CPU
- *
- * You shouldn't call this if cpu_has_vmx() returns 0.
- */
-static inline void __cpu_emergency_vmxoff(void)
-{
- if (cpu_vmx_enabled())
- cpu_vmxoff();
-}
-
-/** Disable VMX if it is supported and enabled on the current CPU
- */
-static inline void cpu_emergency_vmxoff(void)
-{
- if (cpu_has_vmx())
- __cpu_emergency_vmxoff();
-}
-
-
-
-
-/*
- * SVM functions:
- */
-
-/** Check if the CPU has SVM support
- *
- * You can use the 'msg' arg to get a message describing the problem,
- * if the function returns zero. Simply pass NULL if you are not interested
- * on the messages; gcc should take care of not generating code for
- * the messages on this case.
- */
-static inline int cpu_has_svm(const char **msg)
-{
- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
- boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) {
- if (msg)
- *msg = "not amd or hygon";
- return 0;
- }
-
- if (boot_cpu_data.extended_cpuid_level < SVM_CPUID_FUNC) {
- if (msg)
- *msg = "can't execute cpuid_8000000a";
- return 0;
- }
-
- if (!boot_cpu_has(X86_FEATURE_SVM)) {
- if (msg)
- *msg = "svm not available";
- return 0;
- }
- return 1;
-}
-
-
-/** Disable SVM on the current CPU
- *
- * You should call this only if cpu_has_svm() returned true.
- */
-static inline void cpu_svm_disable(void)
-{
- uint64_t efer;
-
- wrmsrl(MSR_VM_HSAVE_PA, 0);
- rdmsrl(MSR_EFER, efer);
- if (efer & EFER_SVME) {
- /*
- * Force GIF=1 prior to disabling SVM to ensure INIT and NMI
- * aren't blocked, e.g. if a fatal error occurred between CLGI
- * and STGI. Note, STGI may #UD if SVM is disabled from NMI
- * context between reading EFER and executing STGI. In that
- * case, GIF must already be set, otherwise the NMI would have
- * been blocked, so just eat the fault.
- */
- asm_volatile_goto("1: stgi\n\t"
- _ASM_EXTABLE(1b, %l[fault])
- ::: "memory" : fault);
-fault:
- wrmsrl(MSR_EFER, efer & ~EFER_SVME);
- }
-}
-
-/** Makes sure SVM is disabled, if it is supported on the CPU
- */
-static inline void cpu_emergency_svm_disable(void)
-{
- if (cpu_has_svm(NULL))
- cpu_svm_disable();
-}
-
-#endif /* _ASM_X86_VIRTEX_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 0d02c4aafa6f..0e73616b82f3 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -71,7 +71,7 @@
#define SECONDARY_EXEC_RDSEED_EXITING VMCS_CONTROL_BIT(RDSEED_EXITING)
#define SECONDARY_EXEC_ENABLE_PML VMCS_CONTROL_BIT(PAGE_MOD_LOGGING)
#define SECONDARY_EXEC_PT_CONCEAL_VMX VMCS_CONTROL_BIT(PT_CONCEAL_VMX)
-#define SECONDARY_EXEC_XSAVES VMCS_CONTROL_BIT(XSAVES)
+#define SECONDARY_EXEC_ENABLE_XSAVES VMCS_CONTROL_BIT(XSAVES)
#define SECONDARY_EXEC_MODE_BASED_EPT_EXEC VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC)
#define SECONDARY_EXEC_PT_USE_GPA VMCS_CONTROL_BIT(PT_USE_GPA)
#define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING)
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index cdd92ab43cda..54cd959cb316 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -48,27 +48,6 @@ struct crash_memmap_data {
unsigned int type;
};
-/*
- * This is used to VMCLEAR all VMCSs loaded on the
- * processor. And when loading kvm_intel module, the
- * callback function pointer will be assigned.
- *
- * protected by rcu.
- */
-crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
-EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
-
-static inline void cpu_crash_vmclear_loaded_vmcss(void)
-{
- crash_vmclear_fn *do_vmclear_operation = NULL;
-
- rcu_read_lock();
- do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss);
- if (do_vmclear_operation)
- do_vmclear_operation();
- rcu_read_unlock();
-}
-
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
@@ -76,11 +55,6 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
crash_save_cpu(regs, cpu);
/*
- * VMCLEAR VMCSs loaded on all cpus if needed.
- */
- cpu_crash_vmclear_loaded_vmcss();
-
- /*
* Disable Intel PT to stop its logging
*/
cpu_emergency_stop_pt();
@@ -133,11 +107,6 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
crash_smp_send_stop();
- /*
- * VMCLEAR VMCSs loaded on this cpu if needed.
- */
- cpu_crash_vmclear_loaded_vmcss();
-
cpu_emergency_disable_virtualization();
/*
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 3adbe97015c1..830425e6d38e 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -22,7 +22,6 @@
#include <asm/reboot_fixups.h>
#include <asm/reboot.h>
#include <asm/pci_x86.h>
-#include <asm/virtext.h>
#include <asm/cpu.h>
#include <asm/nmi.h>
#include <asm/smp.h>
@@ -530,9 +529,54 @@ static inline void kb_wait(void)
static inline void nmi_shootdown_cpus_on_restart(void);
+#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD)
+/* RCU-protected callback to disable virtualization prior to reboot. */
+static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback;
+
+void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback)
+{
+ if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback)))
+ return;
+
+ rcu_assign_pointer(cpu_emergency_virt_callback, callback);
+}
+EXPORT_SYMBOL_GPL(cpu_emergency_register_virt_callback);
+
+void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback)
+{
+ if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback) != callback))
+ return;
+
+ rcu_assign_pointer(cpu_emergency_virt_callback, NULL);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback);
+
+/*
+ * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during
+ * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if
+ * GIF=0, i.e. if the crash occurred between CLGI and STGI.
+ */
+void cpu_emergency_disable_virtualization(void)
+{
+ cpu_emergency_virt_cb *callback;
+
+ /*
+ * IRQs must be disabled as KVM enables virtualization in hardware via
+ * function call IPIs, i.e. IRQs need to be disabled to guarantee
+ * virtualization stays disabled.
+ */
+ lockdep_assert_irqs_disabled();
+
+ rcu_read_lock();
+ callback = rcu_dereference(cpu_emergency_virt_callback);
+ if (callback)
+ callback();
+ rcu_read_unlock();
+}
+
static void emergency_reboot_disable_virtualization(void)
{
- /* Just make sure we won't change CPUs while doing this */
local_irq_disable();
/*
@@ -545,7 +589,7 @@ static void emergency_reboot_disable_virtualization(void)
* Do the NMI shootdown even if virtualization is off on _this_ CPU, as
* other CPUs may have virtualization enabled.
*/
- if (cpu_has_vmx() || cpu_has_svm(NULL)) {
+ if (rcu_access_pointer(cpu_emergency_virt_callback)) {
/* Safely force _this_ CPU out of VMX/SVM operation. */
cpu_emergency_disable_virtualization();
@@ -553,7 +597,9 @@ static void emergency_reboot_disable_virtualization(void)
nmi_shootdown_cpus_on_restart();
}
}
-
+#else
+static void emergency_reboot_disable_virtualization(void) { }
+#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */
void __attribute__((weak)) mach_reboot_fixups(void)
{
@@ -787,21 +833,9 @@ void machine_crash_shutdown(struct pt_regs *regs)
}
#endif
-
/* This is the CPU performing the emergency shutdown work. */
int crashing_cpu = -1;
-/*
- * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during
- * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if
- * GIF=0, i.e. if the crash occurred between CLGI and STGI.
- */
-void cpu_emergency_disable_virtualization(void)
-{
- cpu_emergency_vmxoff();
- cpu_emergency_svm_disable();
-}
-
#if defined(CONFIG_SMP)
static nmi_shootdown_cb shootdown_callback;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 89ca7f4c1464..66dbd1f4d57d 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -101,7 +101,7 @@ config X86_SGX_KVM
config KVM_AMD
tristate "KVM for AMD processors support"
- depends on KVM
+ depends on KVM && (CPU_SUP_AMD || CPU_SUP_HYGON)
help
Provides support for KVM on AMD processors equipped with the AMD-V
(SVM) extensions.
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index d3432687c9e6..0544e30b4946 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -11,6 +11,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
+#include "linux/lockdep.h"
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
@@ -84,6 +85,18 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
struct kvm_cpuid_entry2 *e;
int i;
+ /*
+ * KVM has a semi-arbitrary rule that querying the guest's CPUID model
+ * with IRQs disabled is disallowed. The CPUID model can legitimately
+ * have over one hundred entries, i.e. the lookup is slow, and IRQs are
+ * typically disabled in KVM only when KVM is in a performance critical
+ * path, e.g. the core VM-Enter/VM-Exit run loop. Nothing will break
+ * if this rule is violated, this assertion is purely to flag potential
+ * performance issues. If this fires, consider moving the lookup out
+ * of the hotpath, e.g. by caching information during CPUID updates.
+ */
+ lockdep_assert_irqs_enabled();
+
for (i = 0; i < nent; i++) {
e = &entries[i];
@@ -312,6 +325,27 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
struct kvm_cpuid_entry2 *best;
+ bool allow_gbpages;
+
+ BUILD_BUG_ON(KVM_NR_GOVERNED_FEATURES > KVM_MAX_NR_GOVERNED_FEATURES);
+ bitmap_zero(vcpu->arch.governed_features.enabled,
+ KVM_MAX_NR_GOVERNED_FEATURES);
+
+ /*
+ * If TDP is enabled, let the guest use GBPAGES if they're supported in
+ * hardware. The hardware page walker doesn't let KVM disable GBPAGES,
+ * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
+ * walk for performance and complexity reasons. Not to mention KVM
+ * _can't_ solve the problem because GVA->GPA walks aren't visible to
+ * KVM once a TDP translation is installed. Mimic hardware behavior so
+ * that KVM's is at least consistent, i.e. doesn't randomly inject #PF.
+ * If TDP is disabled, honor *only* guest CPUID as KVM has full control
+ * and can install smaller shadow pages if the host lacks 1GiB support.
+ */
+ allow_gbpages = tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
+ guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
+ if (allow_gbpages)
+ kvm_governed_feature_set(vcpu, X86_FEATURE_GBPAGES);
best = kvm_find_cpuid_entry(vcpu, 1);
if (best && apic) {
@@ -647,7 +681,8 @@ void kvm_set_cpu_caps(void)
);
kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX,
- F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI)
+ F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) |
+ F(AMX_COMPLEX)
);
kvm_cpu_cap_mask(CPUID_D_1_EAX,
@@ -1154,6 +1189,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
cpuid_entry_override(entry, CPUID_8000_0001_EDX);
cpuid_entry_override(entry, CPUID_8000_0001_ECX);
break;
+ case 0x80000005:
+ /* Pass host L1 cache and TLB info. */
+ break;
case 0x80000006:
/* Drop reserved bits, pass host L2 cache and TLB info. */
entry->edx &= ~GENMASK(17, 16);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index b1658c0de847..284fa4704553 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -232,4 +232,50 @@ static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu,
return vcpu->arch.pv_cpuid.features & (1u << kvm_feature);
}
+enum kvm_governed_features {
+#define KVM_GOVERNED_FEATURE(x) KVM_GOVERNED_##x,
+#include "governed_features.h"
+ KVM_NR_GOVERNED_FEATURES
+};
+
+static __always_inline int kvm_governed_feature_index(unsigned int x86_feature)
+{
+ switch (x86_feature) {
+#define KVM_GOVERNED_FEATURE(x) case x: return KVM_GOVERNED_##x;
+#include "governed_features.h"
+ default:
+ return -1;
+ }
+}
+
+static __always_inline bool kvm_is_governed_feature(unsigned int x86_feature)
+{
+ return kvm_governed_feature_index(x86_feature) >= 0;
+}
+
+static __always_inline void kvm_governed_feature_set(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
+{
+ BUILD_BUG_ON(!kvm_is_governed_feature(x86_feature));
+
+ __set_bit(kvm_governed_feature_index(x86_feature),
+ vcpu->arch.governed_features.enabled);
+}
+
+static __always_inline void kvm_governed_feature_check_and_set(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
+{
+ if (kvm_cpu_cap_has(x86_feature) && guest_cpuid_has(vcpu, x86_feature))
+ kvm_governed_feature_set(vcpu, x86_feature);
+}
+
+static __always_inline bool guest_can_use(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
+{
+ BUILD_BUG_ON(!kvm_is_governed_feature(x86_feature));
+
+ return test_bit(kvm_governed_feature_index(x86_feature),
+ vcpu->arch.governed_features.enabled);
+}
+
#endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 936a397a08cd..2673cd5c46cb 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1799,13 +1799,11 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
op->addr.mem,
&op->val,
op->bytes);
- break;
case OP_MEM_STR:
return segmented_write(ctxt,
op->addr.mem,
op->data,
op->bytes * op->count);
- break;
case OP_XMM:
kvm_write_sse_reg(op->addr.xmm, &op->vec_val);
break;
diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h
new file mode 100644
index 000000000000..423a73395c10
--- /dev/null
+++ b/arch/x86/kvm/governed_features.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(KVM_GOVERNED_FEATURE) || defined(KVM_GOVERNED_X86_FEATURE)
+BUILD_BUG()
+#endif
+
+#define KVM_GOVERNED_X86_FEATURE(x) KVM_GOVERNED_FEATURE(X86_FEATURE_##x)
+
+KVM_GOVERNED_X86_FEATURE(GBPAGES)
+KVM_GOVERNED_X86_FEATURE(XSAVES)
+KVM_GOVERNED_X86_FEATURE(VMX)
+KVM_GOVERNED_X86_FEATURE(NRIPS)
+KVM_GOVERNED_X86_FEATURE(TSCRATEMSR)
+KVM_GOVERNED_X86_FEATURE(V_VMSAVE_VMLOAD)
+KVM_GOVERNED_X86_FEATURE(LBRV)
+KVM_GOVERNED_X86_FEATURE(PAUSEFILTER)
+KVM_GOVERNED_X86_FEATURE(PFTHRESHOLD)
+KVM_GOVERNED_X86_FEATURE(VGIF)
+KVM_GOVERNED_X86_FEATURE(VNMI)
+
+#undef KVM_GOVERNED_X86_FEATURE
+#undef KVM_GOVERNED_FEATURE
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index b28fd020066f..7c2dac6824e2 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1293,7 +1293,6 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr)
case HV_X64_MSR_VP_ASSIST_PAGE:
return hv_vcpu->cpuid_cache.features_eax &
HV_MSR_APIC_ACCESS_AVAILABLE;
- break;
case HV_X64_MSR_TSC_FREQUENCY:
case HV_X64_MSR_APIC_FREQUENCY:
return hv_vcpu->cpuid_cache.features_eax &
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index ab65f3a47dfd..be7aeb9b8ea3 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -213,7 +213,6 @@ struct x86_emulate_ops {
bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx,
u32 *ecx, u32 *edx, bool exact_only);
- bool (*guest_has_long_mode)(struct x86_emulate_ctxt *ctxt);
bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt);
bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt);
bool (*guest_has_rdpid)(struct x86_emulate_ctxt *ctxt);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index a983a16163b1..dcd60b39e794 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -376,7 +376,8 @@ void kvm_recalculate_apic_map(struct kvm *kvm)
struct kvm_vcpu *vcpu;
unsigned long i;
u32 max_id = 255; /* enough space for any xAPIC ID */
- bool xapic_id_mismatch = false;
+ bool xapic_id_mismatch;
+ int r;
/* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */
if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN)
@@ -386,9 +387,14 @@ void kvm_recalculate_apic_map(struct kvm *kvm)
"Dirty APIC map without an in-kernel local APIC");
mutex_lock(&kvm->arch.apic_map_lock);
+
+retry:
/*
- * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map
- * (if clean) or the APIC registers (if dirty).
+ * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map (if clean)
+ * or the APIC registers (if dirty). Note, on retry the map may have
+ * not yet been marked dirty by whatever task changed a vCPU's x2APIC
+ * ID, i.e. the map may still show up as in-progress. In that case
+ * this task still needs to retry and complete its calculation.
*/
if (atomic_cmpxchg_acquire(&kvm->arch.apic_map_dirty,
DIRTY, UPDATE_IN_PROGRESS) == CLEAN) {
@@ -397,6 +403,15 @@ void kvm_recalculate_apic_map(struct kvm *kvm)
return;
}
+ /*
+ * Reset the mismatch flag between attempts so that KVM does the right
+ * thing if a vCPU changes its xAPIC ID, but do NOT reset max_id, i.e.
+ * keep max_id strictly increasing. Disallowing max_id from shrinking
+ * ensures KVM won't get stuck in an infinite loop, e.g. if the vCPU
+ * with the highest x2APIC ID is toggling its APIC on and off.
+ */
+ xapic_id_mismatch = false;
+
kvm_for_each_vcpu(i, vcpu, kvm)
if (kvm_apic_present(vcpu))
max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
@@ -415,9 +430,15 @@ void kvm_recalculate_apic_map(struct kvm *kvm)
if (!kvm_apic_present(vcpu))
continue;
- if (kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch)) {
+ r = kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch);
+ if (r) {
kvfree(new);
new = NULL;
+ if (r == -E2BIG) {
+ cond_resched();
+ goto retry;
+ }
+
goto out;
}
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 7b52e31f1151..276157f8496c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4804,28 +4804,13 @@ static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
}
}
-static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu)
-{
- /*
- * If TDP is enabled, let the guest use GBPAGES if they're supported in
- * hardware. The hardware page walker doesn't let KVM disable GBPAGES,
- * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
- * walk for performance and complexity reasons. Not to mention KVM
- * _can't_ solve the problem because GVA->GPA walks aren't visible to
- * KVM once a TDP translation is installed. Mimic hardware behavior so
- * that KVM's is at least consistent, i.e. doesn't randomly inject #PF.
- */
- return tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
- guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
-}
-
static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
struct kvm_mmu *context)
{
__reset_rsvds_bits_mask(&context->guest_rsvd_check,
vcpu->arch.reserved_gpa_bits,
context->cpu_role.base.level, is_efer_nx(context),
- guest_can_use_gbpages(vcpu),
+ guest_can_use(vcpu, X86_FEATURE_GBPAGES),
is_cr4_pse(context),
guest_cpuid_is_amd_or_hygon(vcpu));
}
@@ -4902,7 +4887,8 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
__reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
context->root_role.level,
context->root_role.efer_nx,
- guest_can_use_gbpages(vcpu), is_pse, is_amd);
+ guest_can_use(vcpu, X86_FEATURE_GBPAGES),
+ is_pse, is_amd);
if (!shadow_me_mask)
return;
@@ -6844,7 +6830,7 @@ static void mmu_destroy_caches(void)
static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp)
{
if (nx_hugepage_mitigation_hard_disabled)
- return sprintf(buffer, "never\n");
+ return sysfs_emit(buffer, "never\n");
return param_get_bool(buffer, kp);
}
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
index 56cbdb24400a..b81650678375 100644
--- a/arch/x86/kvm/reverse_cpuid.h
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -43,6 +43,7 @@ enum kvm_only_cpuid_leafs {
/* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */
#define X86_FEATURE_AVX_VNNI_INT8 KVM_X86_FEATURE(CPUID_7_1_EDX, 4)
#define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5)
+#define X86_FEATURE_AMX_COMPLEX KVM_X86_FEATURE(CPUID_7_1_EDX, 8)
#define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14)
/* CPUID level 0x80000007 (EDX). */
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 96936ddf1b3c..dd496c9e5f91 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -107,7 +107,7 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
{
- if (!svm->v_vmload_vmsave_enabled)
+ if (!guest_can_use(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD))
return true;
if (!nested_npt_enabled(svm))
@@ -552,6 +552,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
bool new_vmcb12 = false;
struct vmcb *vmcb01 = svm->vmcb01.ptr;
struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
nested_vmcb02_compute_g_pat(svm);
@@ -577,