Diffstat (limited to 'arch/x86')
35 files changed, 1282 insertions, 831 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fd03cefabd34..9a2849527dd7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -803,6 +803,7 @@ config KVM_GUEST
 	depends on PARAVIRT
 	select PARAVIRT_CLOCK
 	select ARCH_CPUIDLE_HALTPOLL
+	select X86_HV_CALLBACK_VECTOR
 	default y
 	help
 	  This option enables various optimizations for running under the KVM
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 07533795b8d2..275e7fd20310 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -67,12 +67,12 @@ static inline void kvm_set_cpu_l1tf_flush_l1d(void)
 	__this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1);
 }
 
-static inline void kvm_clear_cpu_l1tf_flush_l1d(void)
+static __always_inline void kvm_clear_cpu_l1tf_flush_l1d(void)
 {
 	__this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 0);
 }
 
-static inline bool kvm_get_cpu_l1tf_flush_l1d(void)
+static __always_inline bool kvm_get_cpu_l1tf_flush_l1d(void)
 {
 	return __this_cpu_read(irq_stat.kvm_cpu_l1tf_flush_l1d);
 }
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index ff198fc2495e..a43366191212 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -632,6 +632,10 @@ DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback);
 DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_xen_hvm_callback);
 #endif
 
+#ifdef CONFIG_KVM_GUEST
+DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt);
+#endif
+
 #undef X86_TRAP_OTHER
 
 #endif
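The new DECLARE_IDTENTRY_SYSVEC line gives the KVM guest its own system-vector entry point on the shared hypervisor callback vector. Loosely speaking (the real macro in idtentry.h carries asmlinkage/__visible annotations and the asm stub is generated elsewhere), the declaration boils down to a pair of prototypes:

    /*
     * Simplified sketch of what
     * DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt)
     * provides; not the literal macro expansion.
     */
    void asm_sysvec_kvm_asyncpf_interrupt(void);             /* asm entry stub, wired into the IDT */
    void sysvec_kvm_asyncpf_interrupt(struct pt_regs *regs); /* C handler */

The asm stub is what kvm_guest_init() installs with alloc_intr_gate() later in this diff; the C handler is defined in arch/x86/kernel/kvm.c below.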
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index be5363b21540..5ab3af7275d8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -193,8 +193,6 @@ struct x86_exception;
 enum x86_intercept;
 enum x86_intercept_stage;
 
-#define KVM_NR_MEM_OBJS 40
-
 #define KVM_NR_DB_REGS	4
 
 #define DR6_BD		(1 << 13)
@@ -246,15 +244,6 @@ enum x86_intercept_stage;
 struct kvm_kernel_irq_routing_entry;
 
 /*
- * We don't want allocation failures within the mmu code, so we preallocate
- * enough memory for a single page fault in a cache.
- */
-struct kvm_mmu_memory_cache {
-	int nobjs;
-	void *objects[KVM_NR_MEM_OBJS];
-};
-
-/*
  * the pages used as guest page table on soft mmu are tracked by
  * kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used
  * by indirect shadow page can not be more than 15 bits.
@@ -322,43 +311,6 @@ struct kvm_rmap_head {
 	unsigned long val;
 };
 
-struct kvm_mmu_page {
-	struct list_head link;
-	struct hlist_node hash_link;
-	struct list_head lpage_disallowed_link;
-
-	bool unsync;
-	u8 mmu_valid_gen;
-	bool mmio_cached;
-	bool lpage_disallowed; /* Can't be replaced by an equiv large page */
-
-	/*
-	 * The following two entries are used to key the shadow page in the
-	 * hash table.
-	 */
-	union kvm_mmu_page_role role;
-	gfn_t gfn;
-
-	u64 *spt;
-	/* hold the gfn of each spte inside spt */
-	gfn_t *gfns;
-	int root_count;          /* Currently serving as active root */
-	unsigned int unsync_children;
-	struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
-	DECLARE_BITMAP(unsync_child_bitmap, 512);
-
-#ifdef CONFIG_X86_32
-	/*
-	 * Used out of the mmu-lock to avoid reading spte values while an
-	 * update is in progress; see the comments in __get_spte_lockless().
-	 */
-	int clear_spte_count;
-#endif
-
-	/* Number of writes since the last time traversal visited this page.  */
-	atomic_t write_flooding_count;
-};
-
 struct kvm_pio_request {
 	unsigned long linear_rip;
 	unsigned long count;
@@ -384,6 +336,8 @@ struct kvm_mmu_root_info {
 
 #define KVM_MMU_NUM_PREV_ROOTS 3
 
+struct kvm_mmu_page;
+
 /*
  * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit,
  * and 2-level 32-bit).  The kvm_mmu structure abstracts the details of the
@@ -580,6 +534,7 @@ struct kvm_vcpu_arch {
 	unsigned long cr3;
 	unsigned long cr4;
 	unsigned long cr4_guest_owned_bits;
+	unsigned long cr4_guest_rsvd_bits;
 	unsigned long cr8;
 	u32 host_pkru;
 	u32 pkru;
@@ -635,7 +590,8 @@ struct kvm_vcpu_arch {
 	struct kvm_mmu *walk_mmu;
 
 	struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
-	struct kvm_mmu_memory_cache mmu_page_cache;
+	struct kvm_mmu_memory_cache mmu_shadow_page_cache;
+	struct kvm_mmu_memory_cache mmu_gfn_array_cache;
 	struct kvm_mmu_memory_cache mmu_page_header_cache;
 
 	/*
@@ -683,7 +639,7 @@ struct kvm_vcpu_arch {
 	struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
 
 	int maxphyaddr;
-	int tdp_level;
+	int max_tdp_level;
 
 	/* emulate context */
 
@@ -827,6 +783,9 @@ struct kvm_vcpu_arch {
 	/* Flush the L1 Data cache for L1TF mitigation on VMENTER */
 	bool l1tf_flush_l1d;
 
+	/* Host CPU on which VM-entry was most recently attempted */
+	unsigned int last_vmentry_cpu;
+
 	/* AMD MSRC001_0015 Hardware Configuration */
 	u64 msr_hwcr;
 };
@@ -1083,7 +1042,7 @@ struct kvm_x86_ops {
 	void (*hardware_unsetup)(void);
 	bool (*cpu_has_accelerated_tpr)(void);
 	bool (*has_emulated_msr)(u32 index);
-	void (*cpuid_update)(struct kvm_vcpu *vcpu);
+	void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu);
 
 	unsigned int vm_size;
 	int (*vm_init)(struct kvm *kvm);
@@ -1098,7 +1057,7 @@ struct kvm_x86_ops {
 	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
 	void (*vcpu_put)(struct kvm_vcpu *vcpu);
 
-	void (*update_bp_intercept)(struct kvm_vcpu *vcpu);
+	void (*update_exception_bitmap)(struct kvm_vcpu *vcpu);
 	int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
 	int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
 	u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
@@ -1174,10 +1133,10 @@ struct kvm_x86_ops {
 	int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
-	int (*get_tdp_level)(struct kvm_vcpu *vcpu);
 	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
 
-	void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long cr3);
+	void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long pgd,
+			     int pgd_level);
 
 	bool (*has_wbinvd_exit)(void);
 
@@ -1220,7 +1179,6 @@ struct kvm_x86_ops {
 	void (*enable_log_dirty_pt_masked)(struct kvm *kvm,
 					   struct kvm_memory_slot *slot,
 					   gfn_t offset, unsigned long mask);
-	int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa);
 
 	/* pmu operations of sub-arch */
 	const struct kvm_pmu_ops *pmu_ops;
@@ -1281,6 +1239,7 @@ struct kvm_x86_nested_ops {
 		    struct kvm_nested_state __user *user_kvm_nested_state,
 		    struct kvm_nested_state *kvm_state);
 	bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
+	int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa);
 
 	int (*enable_evmcs)(struct kvm_vcpu *vcpu,
 			    uint16_t *vmcs_version);
@@ -1304,7 +1263,7 @@ struct kvm_arch_async_pf {
 };
 
 extern u64 __read_mostly host_efer;
-
+extern bool __read_mostly allow_smaller_maxphyaddr;
 extern struct kvm_x86_ops kvm_x86_ops;
 
 #define __KVM_HAVE_ARCH_VM_ALLOC
@@ -1549,20 +1508,8 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
 void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush,
 		     bool skip_mmu_sync);
 
-void kvm_configure_mmu(bool enable_tdp, int tdp_page_level);
-
-static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
-				  struct x86_exception *exception)
-{
-	return gpa;
-}
-
-static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
-{
-	struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
-
-	return (struct kvm_mmu_page *)page_private(page);
-}
+void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
+		       int tdp_huge_page_level);
 
 static inline u16 kvm_read_ldt(void)
 {
@@ -1636,7 +1583,15 @@ asmlinkage void kvm_spurious_fault(void);
 	insn "\n\t"						\
 	"jmp	668f \n\t"					\
 	"667: \n\t"						\
+	"1: \n\t"						\
+	".pushsection .discard.instr_begin \n\t"		\
+	".long 1b - . \n\t"					\
+	".popsection \n\t"					\
 	"call	kvm_spurious_fault \n\t"			\
+	"1: \n\t"						\
+	".pushsection .discard.instr_end \n\t"			\
+	".long 1b - . \n\t"					\
+	".popsection \n\t"					\
 	"668: \n\t"						\
 	_ASM_EXTABLE(666b, 667b)
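Dropping KVM_NR_MEM_OBJS and struct kvm_mmu_memory_cache from kvm_host.h goes hand in hand with the new asm/kvm_types.h below: the MMU memory cache becomes arch-neutral and moves to include/linux/kvm_types.h, with x86 only supplying the capacity constant. For orientation, the common definition this series switches to looks roughly like this (shown for context; it is outside the arch/x86 diffstat):

    struct kvm_mmu_memory_cache {
    	int nobjs;                         /* number of preallocated objects */
    	gfp_t gfp_zero;                    /* __GFP_ZERO if objects must be zeroed */
    	struct kmem_cache *kmem_cache;     /* NULL means plain page allocations */
    	void *objects[KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE];
    };

The preallocation exists for the reason the deleted comment gave: the MMU code must never see an allocation failure in the middle of handling a page fault.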
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 49d3a9edb06f..338119852512 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -4,6 +4,7 @@
 
 #include <asm/processor.h>
 #include <asm/alternative.h>
+#include <linux/interrupt.h>
 #include <uapi/asm/kvm_para.h>
 
 extern void kvmclock_init(void);
@@ -18,7 +19,7 @@ static inline bool kvm_check_and_clear_guest_paused(void)
 #endif /* CONFIG_KVM_GUEST */
 
 #define KVM_HYPERCALL \
-        ALTERNATIVE(".byte 0x0f,0x01,0xc1", ".byte 0x0f,0x01,0xd9", X86_FEATURE_VMMCALL)
+	ALTERNATIVE("vmcall", "vmmcall", X86_FEATURE_VMMCALL)
 
 /* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
 * instruction.  The hypervisor may replace it with something else but only the
diff --git a/arch/x86/include/asm/kvm_types.h b/arch/x86/include/asm/kvm_types.h
new file mode 100644
index 000000000000..08f1b57d3b62
--- /dev/null
+++ b/arch/x86/include/asm/kvm_types.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_KVM_TYPES_H
+#define _ASM_X86_KVM_TYPES_H
+
+#define KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 40
+
+#endif /* _ASM_X86_KVM_TYPES_H */
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index 444d6fd9a6d8..d86ab942219c 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -32,6 +32,7 @@ extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
 extern void __pv_init_lock_hash(void);
 extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
 extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock);
+extern bool nopvspin;
 
 #define	queued_spin_unlock queued_spin_unlock
 /**
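The KVM_HYPERCALL rewrite is cosmetic: current assemblers accept the vmcall/vmmcall mnemonics, so the hand-coded byte sequences can go, while ALTERNATIVE() still patches in vmmcall on AMD at boot. For context, this is how the macro is consumed by the pre-existing hypercall helpers in the same header (unchanged code, shown for illustration):

    static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
    {
    	long ret;
    
    	/* hypercall number in RAX, first argument in RBX, result in RAX */
    	asm volatile(KVM_HYPERCALL
    		     : "=a"(ret)
    		     : "a"(nr), "b"(p1)
    		     : "memory");
    	return ret;
    }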
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 233c77d056c9..08320b0b2b27 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -7,8 +7,11 @@
  *   Authors: Anthony Liguori <aliguori@us.ibm.com>
  */
 
+#define pr_fmt(fmt) "kvm-guest: " fmt
+
 #include <linux/context_tracking.h>
 #include <linux/init.h>
+#include <linux/irq.h>
 #include <linux/kernel.h>
 #include <linux/kvm_para.h>
 #include <linux/cpu.h>
@@ -232,16 +235,11 @@ EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags);
 
 noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
 {
-	u32 reason = kvm_read_and_reset_apf_flags();
+	u32 flags = kvm_read_and_reset_apf_flags();
 	irqentry_state_t state;
 
-	switch (reason) {
-	case KVM_PV_REASON_PAGE_NOT_PRESENT:
-	case KVM_PV_REASON_PAGE_READY:
-		break;
-	default:
+	if (!flags)
 		return false;
-	}
 
 	state = irqentry_enter(regs);
 	instrumentation_begin();
@@ -254,13 +252,13 @@ noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
 	if (unlikely(!(regs->flags & X86_EFLAGS_IF)))
 		panic("Host injected async #PF in interrupt disabled region\n");
 
-	if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) {
+	if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
 		if (unlikely(!(user_mode(regs))))
 			panic("Host injected async #PF in kernel mode\n");
 		/* Page is swapped out by the host. */
 		kvm_async_pf_task_wait_schedule(token);
 	} else {
-		kvm_async_pf_task_wake(token);
+		WARN_ONCE(1, "Unexpected async PF flags: %x\n", flags);
 	}
 
 	instrumentation_end();
@@ -268,6 +266,27 @@ noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
 	return true;
 }
 
+DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+	u32 token;
+	irqentry_state_t state;
+
+	state = irqentry_enter(regs);
+
+	inc_irq_stat(irq_hv_callback_count);
+
+	if (__this_cpu_read(apf_reason.enabled)) {
+		token = __this_cpu_read(apf_reason.token);
+		kvm_async_pf_task_wake(token);
+		__this_cpu_write(apf_reason.token, 0);
+		wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1);
+	}
+
+	irqentry_exit(regs, state);
+	set_irq_regs(old_regs);
+}
+
 static void __init paravirt_ops_setup(void)
 {
 	pv_info.name = "KVM";
@@ -289,8 +308,8 @@ static void kvm_register_steal_time(void)
 		return;
 
 	wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
-	pr_info("kvm-stealtime: cpu %d, msr %llx\n",
-		cpu, (unsigned long long) slow_virt_to_phys(st));
+	pr_info("stealtime: cpu %d, msr %llx\n", cpu,
+		(unsigned long long) slow_virt_to_phys(st));
 }
 
 static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
@@ -311,17 +330,19 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
 
 static void kvm_guest_cpu_init(void)
 {
-	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
-		u64 pa;
+	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
+		u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
 
 		WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
 
-		pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
-		pa |= KVM_ASYNC_PF_ENABLED;
+		pa |= KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
 
 		if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
 			pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
 
+		wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);
+
 		wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
 		__this_cpu_write(apf_reason.enabled, 1);
 		pr_info("KVM setup async PF for cpu %d\n", smp_processor_id());
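Both the new sysvec handler and the enablement code above operate on the per-CPU apf_reason region whose physical address is written to MSR_KVM_ASYNC_PF_EN. For reference, its uapi layout in this series is approximately the following ('page not present' events still arrive via #PF and use flags; 'page ready' events now arrive via the interrupt and use token):

    /* Approximate layout from uapi/asm/kvm_para.h at this point in the series. */
    struct kvm_vcpu_pv_apf_data {
    	/* Used for 'page not present' events delivered via #PF */
    	__u32 flags;
    	/* Used for 'page ready' events delivered via interrupt notification */
    	__u32 token;
    	__u8 pad[56];
    	__u32 enabled;
    };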
pr_info("setup PV IPIs\n"); } static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) @@ -551,13 +574,6 @@ static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) } } -static void __init kvm_smp_prepare_cpus(unsigned int max_cpus) -{ - native_smp_prepare_cpus(max_cpus); - if (kvm_para_has_hint(KVM_HINTS_REALTIME)) - static_branch_disable(&virt_spin_lock_key); -} - static void __init kvm_smp_prepare_boot_cpu(void) { /* @@ -646,19 +662,20 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); - if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { static_branch_enable(&kvm_async_pf_enabled); + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_kvm_asyncpf_interrupt); + } #ifdef CONFIG_SMP - smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus; smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; if (pv_sched_yield_supported()) { smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi; - pr_info("KVM setup pv sched yield\n"); + pr_info("setup PV sched yield\n"); } if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online", kvm_cpu_online, kvm_cpu_down_prepare) < 0) - pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n"); + pr_err("failed to install cpu hotplug callbacks\n"); #else sev_map_percpu_data(); kvm_guest_cpu_init(); @@ -854,16 +871,36 @@ asm( */ void __init kvm_spinlock_init(void) { - /* Does host kernel support KVM_FEATURE_PV_UNHALT? */ - if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) + /* + * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an + * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is + * preferred over native qspinlock when vCPU is preempted. + */ + if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) { + pr_info("PV spinlocks disabled, no host support\n"); return; + } - if (kvm_para_has_hint(KVM_HINTS_REALTIME)) - return; + /* + * Disable PV spinlocks and use native qspinlock when dedicated pCPUs + * are available. + */ + if (kvm_para_has_hint(KVM_HINTS_REALTIME)) { + pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n"); + goto out; + } - /* Don't use the pvqspinlock code if there is only 1 vCPU. */ - if (num_possible_cpus() == 1) - return; + if (num_possible_cpus() == 1) { + pr_info("PV spinlocks disabled, single CPU\n"); + goto out; + } + + if (nopvspin) { + pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n"); + goto out; + } + + pr_info("PV spinlocks enabled\n"); __pv_init_lock_hash(); pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; @@ -876,6 +913,13 @@ void __init kvm_spinlock_init(void) pv_ops.lock.vcpu_is_preempted = PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); } + /* + * When PV spinlock is enabled which is preferred over + * virt_spin_lock(), virt_spin_lock_key's value is meaningless. + * Just disable it anyway. 
@@ -895,8 +939,8 @@ static void kvm_enable_host_haltpoll(void *i)
 void arch_haltpoll_enable(unsigned int cpu)
 {
 	if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) {
-		pr_err_once("kvm: host does not support poll control\n");
-		pr_err_once("kvm: host upgrade recommended\n");
+		pr_err_once("host does not support poll control\n");
+		pr_err_once("host upgrade recommended\n");
 		return;
 	}
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 8a294f9747aa..fa873e3e6e90 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -54,28 +54,38 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted)
 
 #define F feature_bit
 
-int kvm_update_cpuid(struct kvm_vcpu *vcpu)
+static int kvm_check_cpuid(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
-	struct kvm_lapic *apic = vcpu->arch.apic;
 
-	best = kvm_find_cpuid_entry(vcpu, 1, 0);
-	if (!best)
-		return 0;
+	/*
+	 * The existing code assumes virtual address is 48-bit or 57-bit in the
+	 * canonical address checks; exit if it is ever changed.
+	 */
+	best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
+	if (best) {
+		int vaddr_bits = (best->eax & 0xff00) >> 8;
+
+		if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0)
+			return -EINVAL;
+	}
 
-	/* Update OSXSAVE bit */
-	if (boot_cpu_has(X86_FEATURE_XSAVE) && best->function == 0x1)
-		cpuid_entry_change(best, X86_FEATURE_OSXSAVE,
+	return 0;
+}
+
+void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 1, 0);
+	if (best) {
+		/* Update OSXSAVE bit */
+		if (boot_cpu_has(X86_FEATURE_XSAVE))
+			cpuid_entry_change(best, X86_FEATURE_OSXSAVE,
 				   kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE));
 
-	cpuid_entry_change(best, X86_FEATURE_APIC,
+		cpuid_entry_change(best, X86_FEATURE_APIC,
 			   vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
-
-	if (apic) {
-		if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER))
-			apic->lapic_timer.timer_mode_mask = 3 << 17;
-		else
-			apic->lapic_timer.timer_mode_mask = 1 << 17;
 	}
 
 	best = kvm_find_cpuid_entry(vcpu, 7, 0);
@@ -84,31 +94,14 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
 				   kvm_read_cr4_bits(vcpu, X86_CR4_PKE));
 
 	best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
-	if (!best) {
-		vcpu->arch.guest_supported_xcr0 = 0;
-	} else {
-		vcpu->arch.guest_supported_xcr0 =
-			(best->eax | ((u64)best->edx << 32)) & supported_xcr0;
+	if (best)
 		best->ebx = xstate_required_size(vcpu->arch.xcr0, false);
-	}
 
 	best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
 	if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
 		     cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
 		best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
 
-	/*
-	 * The existing code assumes virtual address is 48-bit or 57-bit in the
-	 * canonical address checks; exit if it is ever changed.
-	 */
-	best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
-	if (best) {
-		int vaddr_bits = (best->eax & 0xff00) >> 8;
-
-		if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0)
-			return -EINVAL;
-	}
-
 	best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0);
 	if (kvm_hlt_in_guest(vcpu->kvm) && best &&
 		(best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
@@ -121,14 +114,39 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
 					   vcpu->arch.ia32_misc_enable_msr &
 					   MSR_IA32_MISC_ENABLE_MWAIT);
 	}
+}
+
+static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	struct kvm_cpuid_entry2 *best;
+
+	kvm_x86_ops.vcpu_after_set_cpuid(vcpu);
+
+	best = kvm_find_cpuid_entry(vcpu, 1, 0);
+	if (best && apic) {
+		if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER))
+			apic->lapic_timer.timer_mode_mask = 3 << 17;
+		else
+			apic->lapic_timer.timer_mode_mask = 1 << 17;
+
+		kvm_apic_set_version(vcpu);
+	}
+
+	best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
+	if (!best)
+		vcpu->arch.guest_supported_xcr0 = 0;
+	else
+		vcpu->arch.guest_supported_xcr0 =
+			(best->eax | ((u64)best->edx << 32)) & supported_xcr0;
 
-	/* Note, maxphyaddr must be updated before tdp_level. */
 	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
-	vcpu->arch.tdp_level = kvm_x86_ops.get_tdp_level(vcpu);
 	kvm_mmu_reset_context(vcpu);
 
 	kvm_pmu_refresh(vcpu);
-	return 0;
+	vcpu->arch.cr4_guest_rsvd_bits =
+	    __cr4_reserved_bits(guest_cpuid_has, vcpu);
+	kvm_x86_ops.update_exception_bitmap(vcpu);
 }
 
 static int is_efer_nx(void)
@@ -203,10 +221,16 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
 		vcpu->arch.cpuid_entries[i].padding[2] = 0;
 	}
 	vcpu->arch.cpuid_nent = cpuid->nent;
+	r = kvm_check_cpuid(vcpu);
+	if (r) {
+		vcpu->arch.cpuid_nent = 0;
+		kvfree(cpuid_entries);
+		goto out;
+	}
+
 	cpuid_fix_nx_cap(vcpu);
-	kvm_apic_set_version(vcpu);
-	kvm_x86_ops.cpuid_update(vcpu);
-	r = kvm_update_cpuid(vcpu);
+	kvm_update_cpuid_runtime(vcpu);
+	kvm_vcpu_after_set_cpuid(vcpu);
 
 	kvfree(cpuid_entries);
 out:
@@ -227,9 +251,14 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
 			   cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
 		goto out;
 	vcpu->arch.cpuid_nent = cpuid->nent;
-	kvm_apic_set_version(vcpu);
-	kvm_x86_ops.cpuid_update(vcpu);
-	r = kvm_update_cpuid(vcpu);
+	r = kvm_check_cpuid(vcpu);
+	if (r) {
+		vcpu->arch.cpuid_nent = 0;
+		goto out;
+	}
+
+	kvm_update_cpuid_runtime(vcpu);
+	kvm_vcpu_after_set_cpuid(vcpu);
 out:
 	return r;
 }
@@ -604,7 +633,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 		eax.split.bit_width = cap.bit_width_gp;
 		eax.split.mask_length = cap.events_mask_len;
 
-		edx.split.num_counters_fixed = cap.num_counters_fixed;
+		edx.split.num_counters_fixed = min(cap.num_counters_fixed, MAX_FIXED_COUNTERS);
 		edx.split.bit_width_fixed = cap.bit_width_fixed;
 		edx.split.reserved = 0;
 
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 05434cd9342f..3a923ae15f2f 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -9,7 +9,7 @@ extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
 
 void kvm_set_cpu_caps(void);
 
-int kvm_update_cpuid(struct kvm_vcpu *vcpu);
+void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
 					      u32 function, u32 index);
 int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
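With kvm_update_cpuid() gone, setting guest CPUID becomes a three-phase sequence: validate, refresh the runtime-mutable bits, then propagate the new table. A condensed sketch of the ordering both ioctl paths above now follow (example_set_cpuid is illustrative, not a function in the diff):

    static int example_set_cpuid(struct kvm_vcpu *vcpu)
    {
    	int r;
    
    	r = kvm_check_cpuid(vcpu);		/* validate; may fail with -EINVAL */
    	if (r) {
    		vcpu->arch.cpuid_nent = 0;	/* drop the rejected entries */
    		return r;
    	}
    
    	kvm_update_cpuid_runtime(vcpu);		/* OSXSAVE, APIC, OSPKE, XSAVE sizes, ... */
    	kvm_vcpu_after_set_cpuid(vcpu);		/* vendor hook, xcr0, MMU + PMU refresh */
    	return 0;
    }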
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4ce2ddd26c0b..5ccbee7165a2 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -354,7 +354,6 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val)
 void kvm_apic_set_version(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
-	struct kvm_cpuid_entry2 *feat;
 	u32 v = APIC_VERSION;
 
 	if (!lapic_in_kernel(vcpu))
@@ -367,8 +366,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
 	 * version first and level-triggered interrupts never get EOIed in
 	 * IOAPIC.
 	 */
-	feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
-	if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))) &&
+	if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) &&
 	    !ioapic_in_kernel(vcpu->kvm))
 		v |= APIC_LVR_DIRECTED_EOI;
 	kvm_lapic_set_reg(apic, APIC_LVR, v);
@@ -2068,7 +2066,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 	case APIC_TDCR: {
 		uint32_t old_divisor = apic->divide_count;
 
-		kvm_lapic_set_reg(apic, APIC_TDCR, val);
+		kvm_lapic_set_reg(apic, APIC_TDCR, val & 0xb);
 		update_divide_count(apic);
 		if (apic->divide_count != old_divisor &&
 				apic->lapic_timer.period) {
@@ -2085,7 +2083,8 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 
 	case APIC_SELF_IPI:
 		if (apic_x2apic_mode(apic)) {
-			kvm_lapic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff));
+			kvm_lapic_reg_write(apic, APIC_ICR,
+					    APIC_DEST_SELF | (val & APIC_VECTOR_MASK));
 		} else
 			ret = 1;
 		break;
@@ -2232,7 +2231,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 	vcpu->arch.apic_base = value;
 
 	if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE)
-		kvm_update_cpuid(vcpu