diff options
author | Paolo Bonzini <pbonzini@redhat.com> | 2023-08-31 13:36:33 -0400 |
---|---|---|
committer | Paolo Bonzini <pbonzini@redhat.com> | 2023-08-31 13:36:33 -0400 |
commit | 6d5e3c318a33edb1f9176964c4ed7f076fc4248c (patch) | |
tree | 0af8c82dd29310f3d8b8bceb9107227b3b03495f | |
parent | bd7fe98b353b7e52b3db239d86cbe04234097a20 (diff) | |
parent | 9717efbe5ba3f52d4b3cc637c7f7f6149ea264bb (diff) | |
download | linux-6d5e3c318a33edb1f9176964c4ed7f076fc4248c.tar.gz linux-6d5e3c318a33edb1f9176964c4ed7f076fc4248c.tar.bz2 linux-6d5e3c318a33edb1f9176964c4ed7f076fc4248c.zip |
Merge tag 'kvm-x86-misc-6.6' of https://github.com/kvm-x86/linux into HEAD
KVM x86 changes for 6.6:
- Misc cleanups
- Retry APIC optimized recalculation if a vCPU is added/enabled
- Overhaul emergency reboot code to bring SVM up to par with VMX, tie the
"emergency disabling" behavior to KVM actually being loaded, and move all of
the logic within KVM
- Fix user triggerable WARNs in SVM where KVM incorrectly assumes the TSC
ratio MSR can diverge from the default iff TSC scaling is enabled, and clean
up related code
- Add a framework to allow "caching" feature flags so that KVM can check if
the guest can use a feature without needing to search guest CPUID
28 files changed, 490 insertions, 445 deletions
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 5b77bbc28f96..819046974b99 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -205,8 +205,6 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image); #endif #endif -typedef void crash_vmclear_fn(void); -extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss; extern void kdump_nmi_shootdown_cpus(void); #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6523f5494cb2..e3c9ff4146fc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -746,7 +746,6 @@ struct kvm_vcpu_arch { u64 smi_count; bool at_instruction_boundary; bool tpr_access_reporting; - bool xsaves_enabled; bool xfd_no_write_intercept; u64 ia32_xss; u64 microcode_version; @@ -831,6 +830,25 @@ struct kvm_vcpu_arch { struct kvm_cpuid_entry2 *cpuid_entries; struct kvm_hypervisor_cpuid kvm_cpuid; + /* + * FIXME: Drop this macro and use KVM_NR_GOVERNED_FEATURES directly + * when "struct kvm_vcpu_arch" is no longer defined in an + * arch/x86/include/asm header. The max is mostly arbitrary, i.e. + * can be increased as necessary. + */ +#define KVM_MAX_NR_GOVERNED_FEATURES BITS_PER_LONG + + /* + * Track whether or not the guest is allowed to use features that are + * governed by KVM, where "governed" means KVM needs to manage state + * and/or explicitly enable the feature in hardware. Typically, but + * not always, governed features can be used by the guest if and only + * if both KVM and userspace want to expose the feature to the guest. + */ + struct { + DECLARE_BITMAP(enabled, KVM_MAX_NR_GOVERNED_FEATURES); + } governed_features; + u64 reserved_gpa_bits; int maxphyaddr; @@ -1655,8 +1673,8 @@ struct kvm_x86_ops { u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu); u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu); - void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); - void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier); + void (*write_tsc_offset)(struct kvm_vcpu *vcpu); + void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu); /* * Retrieve somewhat arbitrary exit information. Intended to diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 9177b4354c3f..6536873f8fc0 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -25,7 +25,14 @@ void __noreturn machine_real_restart(unsigned int type); #define MRR_BIOS 0 #define MRR_APM 1 +#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) +typedef void (cpu_emergency_virt_cb)(void); +void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback); +void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback); void cpu_emergency_disable_virtualization(void); +#else +static inline void cpu_emergency_disable_virtualization(void) {} +#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */ typedef void (*nmi_shootdown_cb)(int, struct pt_regs*); void nmi_shootdown_cpus(nmi_shootdown_cb callback); diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h deleted file mode 100644 index 3b12e6b99412..000000000000 --- a/arch/x86/include/asm/virtext.h +++ /dev/null @@ -1,154 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* CPU virtualization extensions handling - * - * This should carry the code for handling CPU virtualization extensions - * that needs to live in the kernel core. - * - * Author: Eduardo Habkost <ehabkost@redhat.com> - * - * Copyright (C) 2008, Red Hat Inc. - * - * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc. - */ -#ifndef _ASM_X86_VIRTEX_H -#define _ASM_X86_VIRTEX_H - -#include <asm/processor.h> - -#include <asm/vmx.h> -#include <asm/svm.h> -#include <asm/tlbflush.h> - -/* - * VMX functions: - */ - -static inline int cpu_has_vmx(void) -{ - unsigned long ecx = cpuid_ecx(1); - return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ -} - - -/** - * cpu_vmxoff() - Disable VMX on the current CPU - * - * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) - * - * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to - * atomically track post-VMXON state, e.g. this may be called in NMI context. - * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. - * faults are guaranteed to be due to the !post-VMXON check unless the CPU is - * magically in RM, VM86, compat mode, or at CPL>0. - */ -static inline int cpu_vmxoff(void) -{ - asm_volatile_goto("1: vmxoff\n\t" - _ASM_EXTABLE(1b, %l[fault]) - ::: "cc", "memory" : fault); - - cr4_clear_bits(X86_CR4_VMXE); - return 0; - -fault: - cr4_clear_bits(X86_CR4_VMXE); - return -EIO; -} - -static inline int cpu_vmx_enabled(void) -{ - return __read_cr4() & X86_CR4_VMXE; -} - -/** Disable VMX if it is enabled on the current CPU - * - * You shouldn't call this if cpu_has_vmx() returns 0. - */ -static inline void __cpu_emergency_vmxoff(void) -{ - if (cpu_vmx_enabled()) - cpu_vmxoff(); -} - -/** Disable VMX if it is supported and enabled on the current CPU - */ -static inline void cpu_emergency_vmxoff(void) -{ - if (cpu_has_vmx()) - __cpu_emergency_vmxoff(); -} - - - - -/* - * SVM functions: - */ - -/** Check if the CPU has SVM support - * - * You can use the 'msg' arg to get a message describing the problem, - * if the function returns zero. Simply pass NULL if you are not interested - * on the messages; gcc should take care of not generating code for - * the messages on this case. - */ -static inline int cpu_has_svm(const char **msg) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && - boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) { - if (msg) - *msg = "not amd or hygon"; - return 0; - } - - if (boot_cpu_data.extended_cpuid_level < SVM_CPUID_FUNC) { - if (msg) - *msg = "can't execute cpuid_8000000a"; - return 0; - } - - if (!boot_cpu_has(X86_FEATURE_SVM)) { - if (msg) - *msg = "svm not available"; - return 0; - } - return 1; -} - - -/** Disable SVM on the current CPU - * - * You should call this only if cpu_has_svm() returned true. - */ -static inline void cpu_svm_disable(void) -{ - uint64_t efer; - - wrmsrl(MSR_VM_HSAVE_PA, 0); - rdmsrl(MSR_EFER, efer); - if (efer & EFER_SVME) { - /* - * Force GIF=1 prior to disabling SVM to ensure INIT and NMI - * aren't blocked, e.g. if a fatal error occurred between CLGI - * and STGI. Note, STGI may #UD if SVM is disabled from NMI - * context between reading EFER and executing STGI. In that - * case, GIF must already be set, otherwise the NMI would have - * been blocked, so just eat the fault. - */ - asm_volatile_goto("1: stgi\n\t" - _ASM_EXTABLE(1b, %l[fault]) - ::: "memory" : fault); -fault: - wrmsrl(MSR_EFER, efer & ~EFER_SVME); - } -} - -/** Makes sure SVM is disabled, if it is supported on the CPU - */ -static inline void cpu_emergency_svm_disable(void) -{ - if (cpu_has_svm(NULL)) - cpu_svm_disable(); -} - -#endif /* _ASM_X86_VIRTEX_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 0d02c4aafa6f..0e73616b82f3 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -71,7 +71,7 @@ #define SECONDARY_EXEC_RDSEED_EXITING VMCS_CONTROL_BIT(RDSEED_EXITING) #define SECONDARY_EXEC_ENABLE_PML VMCS_CONTROL_BIT(PAGE_MOD_LOGGING) #define SECONDARY_EXEC_PT_CONCEAL_VMX VMCS_CONTROL_BIT(PT_CONCEAL_VMX) -#define SECONDARY_EXEC_XSAVES VMCS_CONTROL_BIT(XSAVES) +#define SECONDARY_EXEC_ENABLE_XSAVES VMCS_CONTROL_BIT(XSAVES) #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC) #define SECONDARY_EXEC_PT_USE_GPA VMCS_CONTROL_BIT(PT_USE_GPA) #define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index cdd92ab43cda..54cd959cb316 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -48,27 +48,6 @@ struct crash_memmap_data { unsigned int type; }; -/* - * This is used to VMCLEAR all VMCSs loaded on the - * processor. And when loading kvm_intel module, the - * callback function pointer will be assigned. - * - * protected by rcu. - */ -crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL; -EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); - -static inline void cpu_crash_vmclear_loaded_vmcss(void) -{ - crash_vmclear_fn *do_vmclear_operation = NULL; - - rcu_read_lock(); - do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss); - if (do_vmclear_operation) - do_vmclear_operation(); - rcu_read_unlock(); -} - #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static void kdump_nmi_callback(int cpu, struct pt_regs *regs) @@ -76,11 +55,6 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) crash_save_cpu(regs, cpu); /* - * VMCLEAR VMCSs loaded on all cpus if needed. - */ - cpu_crash_vmclear_loaded_vmcss(); - - /* * Disable Intel PT to stop its logging */ cpu_emergency_stop_pt(); @@ -133,11 +107,6 @@ void native_machine_crash_shutdown(struct pt_regs *regs) crash_smp_send_stop(); - /* - * VMCLEAR VMCSs loaded on this cpu if needed. - */ - cpu_crash_vmclear_loaded_vmcss(); - cpu_emergency_disable_virtualization(); /* diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 3adbe97015c1..830425e6d38e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -22,7 +22,6 @@ #include <asm/reboot_fixups.h> #include <asm/reboot.h> #include <asm/pci_x86.h> -#include <asm/virtext.h> #include <asm/cpu.h> #include <asm/nmi.h> #include <asm/smp.h> @@ -530,9 +529,54 @@ static inline void kb_wait(void) static inline void nmi_shootdown_cpus_on_restart(void); +#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) +/* RCU-protected callback to disable virtualization prior to reboot. */ +static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback; + +void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback) +{ + if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback))) + return; + + rcu_assign_pointer(cpu_emergency_virt_callback, callback); +} +EXPORT_SYMBOL_GPL(cpu_emergency_register_virt_callback); + +void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) +{ + if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback) != callback)) + return; + + rcu_assign_pointer(cpu_emergency_virt_callback, NULL); + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback); + +/* + * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during + * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if + * GIF=0, i.e. if the crash occurred between CLGI and STGI. + */ +void cpu_emergency_disable_virtualization(void) +{ + cpu_emergency_virt_cb *callback; + + /* + * IRQs must be disabled as KVM enables virtualization in hardware via + * function call IPIs, i.e. IRQs need to be disabled to guarantee + * virtualization stays disabled. + */ + lockdep_assert_irqs_disabled(); + + rcu_read_lock(); + callback = rcu_dereference(cpu_emergency_virt_callback); + if (callback) + callback(); + rcu_read_unlock(); +} + static void emergency_reboot_disable_virtualization(void) { - /* Just make sure we won't change CPUs while doing this */ local_irq_disable(); /* @@ -545,7 +589,7 @@ static void emergency_reboot_disable_virtualization(void) * Do the NMI shootdown even if virtualization is off on _this_ CPU, as * other CPUs may have virtualization enabled. */ - if (cpu_has_vmx() || cpu_has_svm(NULL)) { + if (rcu_access_pointer(cpu_emergency_virt_callback)) { /* Safely force _this_ CPU out of VMX/SVM operation. */ cpu_emergency_disable_virtualization(); @@ -553,7 +597,9 @@ static void emergency_reboot_disable_virtualization(void) nmi_shootdown_cpus_on_restart(); } } - +#else +static void emergency_reboot_disable_virtualization(void) { } +#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */ void __attribute__((weak)) mach_reboot_fixups(void) { @@ -787,21 +833,9 @@ void machine_crash_shutdown(struct pt_regs *regs) } #endif - /* This is the CPU performing the emergency shutdown work. */ int crashing_cpu = -1; -/* - * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during - * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if - * GIF=0, i.e. if the crash occurred between CLGI and STGI. - */ -void cpu_emergency_disable_virtualization(void) -{ - cpu_emergency_vmxoff(); - cpu_emergency_svm_disable(); -} - #if defined(CONFIG_SMP) static nmi_shootdown_cb shootdown_callback; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 89ca7f4c1464..66dbd1f4d57d 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -101,7 +101,7 @@ config X86_SGX_KVM config KVM_AMD tristate "KVM for AMD processors support" - depends on KVM + depends on KVM && (CPU_SUP_AMD || CPU_SUP_HYGON) help Provides support for KVM on AMD processors equipped with the AMD-V (SVM) extensions. diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index d3432687c9e6..0544e30b4946 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -11,6 +11,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> +#include "linux/lockdep.h" #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> @@ -84,6 +85,18 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( struct kvm_cpuid_entry2 *e; int i; + /* + * KVM has a semi-arbitrary rule that querying the guest's CPUID model + * with IRQs disabled is disallowed. The CPUID model can legitimately + * have over one hundred entries, i.e. the lookup is slow, and IRQs are + * typically disabled in KVM only when KVM is in a performance critical + * path, e.g. the core VM-Enter/VM-Exit run loop. Nothing will break + * if this rule is violated, this assertion is purely to flag potential + * performance issues. If this fires, consider moving the lookup out + * of the hotpath, e.g. by caching information during CPUID updates. + */ + lockdep_assert_irqs_enabled(); + for (i = 0; i < nent; i++) { e = &entries[i]; @@ -312,6 +325,27 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_cpuid_entry2 *best; + bool allow_gbpages; + + BUILD_BUG_ON(KVM_NR_GOVERNED_FEATURES > KVM_MAX_NR_GOVERNED_FEATURES); + bitmap_zero(vcpu->arch.governed_features.enabled, + KVM_MAX_NR_GOVERNED_FEATURES); + + /* + * If TDP is enabled, let the guest use GBPAGES if they're supported in + * hardware. The hardware page walker doesn't let KVM disable GBPAGES, + * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA + * walk for performance and complexity reasons. Not to mention KVM + * _can't_ solve the problem because GVA->GPA walks aren't visible to + * KVM once a TDP translation is installed. Mimic hardware behavior so + * that KVM's is at least consistent, i.e. doesn't randomly inject #PF. + * If TDP is disabled, honor *only* guest CPUID as KVM has full control + * and can install smaller shadow pages if the host lacks 1GiB support. + */ + allow_gbpages = tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) : + guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES); + if (allow_gbpages) + kvm_governed_feature_set(vcpu, X86_FEATURE_GBPAGES); best = kvm_find_cpuid_entry(vcpu, 1); if (best && apic) { @@ -647,7 +681,8 @@ void kvm_set_cpu_caps(void) ); kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX, - F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) + F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) | + F(AMX_COMPLEX) ); kvm_cpu_cap_mask(CPUID_D_1_EAX, @@ -1154,6 +1189,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) cpuid_entry_override(entry, CPUID_8000_0001_EDX); cpuid_entry_override(entry, CPUID_8000_0001_ECX); break; + case 0x80000005: + /* Pass host L1 cache and TLB info. */ + break; case 0x80000006: /* Drop reserved bits, pass host L2 cache and TLB info. */ entry->edx &= ~GENMASK(17, 16); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index b1658c0de847..284fa4704553 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -232,4 +232,50 @@ static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu, return vcpu->arch.pv_cpuid.features & (1u << kvm_feature); } +enum kvm_governed_features { +#define KVM_GOVERNED_FEATURE(x) KVM_GOVERNED_##x, +#include "governed_features.h" + KVM_NR_GOVERNED_FEATURES +}; + +static __always_inline int kvm_governed_feature_index(unsigned int x86_feature) +{ + switch (x86_feature) { +#define KVM_GOVERNED_FEATURE(x) case x: return KVM_GOVERNED_##x; +#include "governed_features.h" + default: + return -1; + } +} + +static __always_inline bool kvm_is_governed_feature(unsigned int x86_feature) +{ + return kvm_governed_feature_index(x86_feature) >= 0; +} + +static __always_inline void kvm_governed_feature_set(struct kvm_vcpu *vcpu, + unsigned int x86_feature) +{ + BUILD_BUG_ON(!kvm_is_governed_feature(x86_feature)); + + __set_bit(kvm_governed_feature_index(x86_feature), + vcpu->arch.governed_features.enabled); +} + +static __always_inline void kvm_governed_feature_check_and_set(struct kvm_vcpu *vcpu, + unsigned int x86_feature) +{ + if (kvm_cpu_cap_has(x86_feature) && guest_cpuid_has(vcpu, x86_feature)) + kvm_governed_feature_set(vcpu, x86_feature); +} + +static __always_inline bool guest_can_use(struct kvm_vcpu *vcpu, + unsigned int x86_feature) +{ + BUILD_BUG_ON(!kvm_is_governed_feature(x86_feature)); + + return test_bit(kvm_governed_feature_index(x86_feature), + vcpu->arch.governed_features.enabled); +} + #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 936a397a08cd..2673cd5c46cb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1799,13 +1799,11 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) op->addr.mem, &op->val, op->bytes); - break; case OP_MEM_STR: return segmented_write(ctxt, op->addr.mem, op->data, op->bytes * op->count); - break; case OP_XMM: kvm_write_sse_reg(op->addr.xmm, &op->vec_val); break; diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h new file mode 100644 index 000000000000..423a73395c10 --- /dev/null +++ b/arch/x86/kvm/governed_features.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#if !defined(KVM_GOVERNED_FEATURE) || defined(KVM_GOVERNED_X86_FEATURE) +BUILD_BUG() +#endif + +#define KVM_GOVERNED_X86_FEATURE(x) KVM_GOVERNED_FEATURE(X86_FEATURE_##x) + +KVM_GOVERNED_X86_FEATURE(GBPAGES) +KVM_GOVERNED_X86_FEATURE(XSAVES) +KVM_GOVERNED_X86_FEATURE(VMX) +KVM_GOVERNED_X86_FEATURE(NRIPS) +KVM_GOVERNED_X86_FEATURE(TSCRATEMSR) +KVM_GOVERNED_X86_FEATURE(V_VMSAVE_VMLOAD) +KVM_GOVERNED_X86_FEATURE(LBRV) +KVM_GOVERNED_X86_FEATURE(PAUSEFILTER) +KVM_GOVERNED_X86_FEATURE(PFTHRESHOLD) +KVM_GOVERNED_X86_FEATURE(VGIF) +KVM_GOVERNED_X86_FEATURE(VNMI) + +#undef KVM_GOVERNED_X86_FEATURE +#undef KVM_GOVERNED_FEATURE diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index b28fd020066f..7c2dac6824e2 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1293,7 +1293,6 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) case HV_X64_MSR_VP_ASSIST_PAGE: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_APIC_ACCESS_AVAILABLE; - break; case HV_X64_MSR_TSC_FREQUENCY: case HV_X64_MSR_APIC_FREQUENCY: return hv_vcpu->cpuid_cache.features_eax & diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index ab65f3a47dfd..be7aeb9b8ea3 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -213,7 +213,6 @@ struct x86_emulate_ops { bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool exact_only); - bool (*guest_has_long_mode)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_rdpid)(struct x86_emulate_ctxt *ctxt); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a983a16163b1..dcd60b39e794 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -376,7 +376,8 @@ void kvm_recalculate_apic_map(struct kvm *kvm) struct kvm_vcpu *vcpu; unsigned long i; u32 max_id = 255; /* enough space for any xAPIC ID */ - bool xapic_id_mismatch = false; + bool xapic_id_mismatch; + int r; /* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */ if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN) @@ -386,9 +387,14 @@ void kvm_recalculate_apic_map(struct kvm *kvm) "Dirty APIC map without an in-kernel local APIC"); mutex_lock(&kvm->arch.apic_map_lock); + +retry: /* - * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map - * (if clean) or the APIC registers (if dirty). + * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map (if clean) + * or the APIC registers (if dirty). Note, on retry the map may have + * not yet been marked dirty by whatever task changed a vCPU's x2APIC + * ID, i.e. the map may still show up as in-progress. In that case + * this task still needs to retry and complete its calculation. */ if (atomic_cmpxchg_acquire(&kvm->arch.apic_map_dirty, DIRTY, UPDATE_IN_PROGRESS) == CLEAN) { @@ -397,6 +403,15 @@ void kvm_recalculate_apic_map(struct kvm *kvm) return; } + /* + * Reset the mismatch flag between attempts so that KVM does the right + * thing if a vCPU changes its xAPIC ID, but do NOT reset max_id, i.e. + * keep max_id strictly increasing. Disallowing max_id from shrinking + * ensures KVM won't get stuck in an infinite loop, e.g. if the vCPU + * with the highest x2APIC ID is toggling its APIC on and off. + */ + xapic_id_mismatch = false; + kvm_for_each_vcpu(i, vcpu, kvm) if (kvm_apic_present(vcpu)) max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic)); @@ -415,9 +430,15 @@ void kvm_recalculate_apic_map(struct kvm *kvm) if (!kvm_apic_present(vcpu)) continue; - if (kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch)) { + r = kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch); + if (r) { kvfree(new); new = NULL; + if (r == -E2BIG) { + cond_resched(); + goto retry; + } + goto out; } diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7b52e31f1151..276157f8496c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4804,28 +4804,13 @@ static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, } } -static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu) -{ - /* - * If TDP is enabled, let the guest use GBPAGES if they're supported in - * hardware. The hardware page walker doesn't let KVM disable GBPAGES, - * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA - * walk for performance and complexity reasons. Not to mention KVM - * _can't_ solve the problem because GVA->GPA walks aren't visible to - * KVM once a TDP translation is installed. Mimic hardware behavior so - * that KVM's is at least consistent, i.e. doesn't randomly inject #PF. - */ - return tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) : - guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES); -} - static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { __reset_rsvds_bits_mask(&context->guest_rsvd_check, vcpu->arch.reserved_gpa_bits, context->cpu_role.base.level, is_efer_nx(context), - guest_can_use_gbpages(vcpu), + guest_can_use(vcpu, X86_FEATURE_GBPAGES), is_cr4_pse(context), guest_cpuid_is_amd_or_hygon(vcpu)); } @@ -4902,7 +4887,8 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), context->root_role.level, context->root_role.efer_nx, - guest_can_use_gbpages(vcpu), is_pse, is_amd); + guest_can_use(vcpu, X86_FEATURE_GBPAGES), + is_pse, is_amd); if (!shadow_me_mask) return; @@ -6844,7 +6830,7 @@ static void mmu_destroy_caches(void) static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp) { if (nx_hugepage_mitigation_hard_disabled) - return sprintf(buffer, "never\n"); + return sysfs_emit(buffer, "never\n"); return param_get_bool(buffer, kp); } diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 56cbdb24400a..b81650678375 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -43,6 +43,7 @@ enum kvm_only_cpuid_leafs { /* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */ #define X86_FEATURE_AVX_VNNI_INT8 KVM_X86_FEATURE(CPUID_7_1_EDX, 4) #define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5) +#define X86_FEATURE_AMX_COMPLEX KVM_X86_FEATURE(CPUID_7_1_EDX, 8) #define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14) /* CPUID level 0x80000007 (EDX). */ diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 96936ddf1b3c..dd496c9e5f91 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -107,7 +107,7 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm) { - if (!svm->v_vmload_vmsave_enabled) + if (!guest_can_use(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD)) return true; if (!nested_npt_enabled(svm)) @@ -552,6 +552,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 bool new_vmcb12 = false; struct vmcb *vmcb01 = svm->vmcb01.ptr; struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; + struct kvm_vcpu *vcpu = &svm->vcpu; nested_vmcb02_compute_g_pat(svm); @@ -577, |