From 44d17459626052a2390457e550a12cb973506b2f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 29 Aug 2024 21:35:51 -0700 Subject: KVM: Use dedicated mutex to protect kvm_usage_count to avoid deadlock Use a dedicated mutex to guard kvm_usage_count to fix a potential deadlock on x86 due to a chain of locks and SRCU synchronizations. Translating the below lockdep splat, CPU1 #6 will wait on CPU0 #1, CPU0 #8 will wait on CPU2 #3, and CPU2 #7 will wait on CPU1 #4 (if there's a writer, due to the fairness of r/w semaphores). CPU0 CPU1 CPU2 1 lock(&kvm->slots_lock); 2 lock(&vcpu->mutex); 3 lock(&kvm->srcu); 4 lock(cpu_hotplug_lock); 5 lock(kvm_lock); 6 lock(&kvm->slots_lock); 7 lock(cpu_hotplug_lock); 8 sync(&kvm->srcu); Note, there are likely more potential deadlocks in KVM x86, e.g. the same pattern of taking cpu_hotplug_lock outside of kvm_lock likely exists with __kvmclock_cpufreq_notifier(): cpuhp_cpufreq_online() | -> cpufreq_online() | -> cpufreq_gov_performance_limits() | -> __cpufreq_driver_target() | -> __target_index() | -> cpufreq_freq_transition_begin() | -> cpufreq_notify_transition() | -> ... __kvmclock_cpufreq_notifier() But, actually triggering such deadlocks is beyond rare due to the combination of dependencies and timings involved. E.g. the cpufreq notifier is only used on older CPUs without a constant TSC, mucking with the NX hugepage mitigation while VMs are running is very uncommon, and doing so while also onlining/offlining a CPU (necessary to generate contention on cpu_hotplug_lock) would be even more unusual. The most robust solution to the general cpu_hotplug_lock issue is likely to switch vm_list to be an RCU-protected list, e.g. so that x86's cpufreq notifier doesn't to take kvm_lock. For now, settle for fixing the most blatant deadlock, as switching to an RCU-protected list is a much more involved change, but add a comment in locking.rst to call out that care needs to be taken when walking holding kvm_lock and walking vm_list. ====================================================== WARNING: possible circular locking dependency detected 6.10.0-smp--c257535a0c9d-pip #330 Tainted: G S O ------------------------------------------------------ tee/35048 is trying to acquire lock: ff6a80eced71e0a8 (&kvm->slots_lock){+.+.}-{3:3}, at: set_nx_huge_pages+0x179/0x1e0 [kvm] but task is already holding lock: ffffffffc07abb08 (kvm_lock){+.+.}-{3:3}, at: set_nx_huge_pages+0x14a/0x1e0 [kvm] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (kvm_lock){+.+.}-{3:3}: __mutex_lock+0x6a/0xb40 mutex_lock_nested+0x1f/0x30 kvm_dev_ioctl+0x4fb/0xe50 [kvm] __se_sys_ioctl+0x7b/0xd0 __x64_sys_ioctl+0x21/0x30 x64_sys_call+0x15d0/0x2e60 do_syscall_64+0x83/0x160 entry_SYSCALL_64_after_hwframe+0x76/0x7e -> #2 (cpu_hotplug_lock){++++}-{0:0}: cpus_read_lock+0x2e/0xb0 static_key_slow_inc+0x16/0x30 kvm_lapic_set_base+0x6a/0x1c0 [kvm] kvm_set_apic_base+0x8f/0xe0 [kvm] kvm_set_msr_common+0x9ae/0xf80 [kvm] vmx_set_msr+0xa54/0xbe0 [kvm_intel] __kvm_set_msr+0xb6/0x1a0 [kvm] kvm_arch_vcpu_ioctl+0xeca/0x10c0 [kvm] kvm_vcpu_ioctl+0x485/0x5b0 [kvm] __se_sys_ioctl+0x7b/0xd0 __x64_sys_ioctl+0x21/0x30 x64_sys_call+0x15d0/0x2e60 do_syscall_64+0x83/0x160 entry_SYSCALL_64_after_hwframe+0x76/0x7e -> #1 (&kvm->srcu){.+.+}-{0:0}: __synchronize_srcu+0x44/0x1a0 synchronize_srcu_expedited+0x21/0x30 kvm_swap_active_memslots+0x110/0x1c0 [kvm] kvm_set_memslot+0x360/0x620 [kvm] __kvm_set_memory_region+0x27b/0x300 [kvm] kvm_vm_ioctl_set_memory_region+0x43/0x60 [kvm] kvm_vm_ioctl+0x295/0x650 [kvm] __se_sys_ioctl+0x7b/0xd0 __x64_sys_ioctl+0x21/0x30 x64_sys_call+0x15d0/0x2e60 do_syscall_64+0x83/0x160 entry_SYSCALL_64_after_hwframe+0x76/0x7e -> #0 (&kvm->slots_lock){+.+.}-{3:3}: __lock_acquire+0x15ef/0x2e30 lock_acquire+0xe0/0x260 __mutex_lock+0x6a/0xb40 mutex_lock_nested+0x1f/0x30 set_nx_huge_pages+0x179/0x1e0 [kvm] param_attr_store+0x93/0x100 module_attr_store+0x22/0x40 sysfs_kf_write+0x81/0xb0 kernfs_fop_write_iter+0x133/0x1d0 vfs_write+0x28d/0x380 ksys_write+0x70/0xe0 __x64_sys_write+0x1f/0x30 x64_sys_call+0x281b/0x2e60 do_syscall_64+0x83/0x160 entry_SYSCALL_64_after_hwframe+0x76/0x7e Cc: Chao Gao Fixes: 0bf50497f03b ("KVM: Drop kvm_count_lock and instead protect kvm_usage_count with kvm_lock") Cc: stable@vger.kernel.org Reviewed-by: Kai Huang Acked-by: Kai Huang Tested-by: Farrah Chen Signed-off-by: Sean Christopherson Message-ID: <20240830043600.127750-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index cb2b78e92910..7164a9ece208 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5575,6 +5575,7 @@ __visible bool kvm_rebooting; EXPORT_SYMBOL_GPL(kvm_rebooting); static DEFINE_PER_CPU(bool, hardware_enabled); +static DEFINE_MUTEX(kvm_usage_lock); static int kvm_usage_count; static int __hardware_enable_nolock(void) @@ -5607,10 +5608,10 @@ static int kvm_online_cpu(unsigned int cpu) * be enabled. Otherwise running VMs would encounter unrecoverable * errors when scheduled to this CPU. */ - mutex_lock(&kvm_lock); + mutex_lock(&kvm_usage_lock); if (kvm_usage_count) ret = __hardware_enable_nolock(); - mutex_unlock(&kvm_lock); + mutex_unlock(&kvm_usage_lock); return ret; } @@ -5630,10 +5631,10 @@ static void hardware_disable_nolock(void *junk) static int kvm_offline_cpu(unsigned int cpu) { - mutex_lock(&kvm_lock); + mutex_lock(&kvm_usage_lock); if (kvm_usage_count) hardware_disable_nolock(NULL); - mutex_unlock(&kvm_lock); + mutex_unlock(&kvm_usage_lock); return 0; } @@ -5649,9 +5650,9 @@ static void hardware_disable_all_nolock(void) static void hardware_disable_all(void) { cpus_read_lock(); - mutex_lock(&kvm_lock); + mutex_lock(&kvm_usage_lock); hardware_disable_all_nolock(); - mutex_unlock(&kvm_lock); + mutex_unlock(&kvm_usage_lock); cpus_read_unlock(); } @@ -5682,7 +5683,7 @@ static int hardware_enable_all(void) * enable hardware multiple times. */ cpus_read_lock(); - mutex_lock(&kvm_lock); + mutex_lock(&kvm_usage_lock); r = 0; @@ -5696,7 +5697,7 @@ static int hardware_enable_all(void) } } - mutex_unlock(&kvm_lock); + mutex_unlock(&kvm_usage_lock); cpus_read_unlock(); return r; @@ -5724,13 +5725,13 @@ static int kvm_suspend(void) { /* * Secondary CPUs and CPU hotplug are disabled across the suspend/resume - * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count - * is stable. Assert that kvm_lock is not held to ensure the system - * isn't suspended while KVM is enabling hardware. Hardware enabling - * can be preempted, but the task cannot be frozen until it has dropped - * all locks (userspace tasks are frozen via a fake signal). + * callbacks, i.e. no need to acquire kvm_usage_lock to ensure the usage + * count is stable. Assert that kvm_usage_lock is not held to ensure + * the system isn't suspended while KVM is enabling hardware. Hardware + * enabling can be preempted, but the task cannot be frozen until it has + * dropped all locks (userspace tasks are frozen via a fake signal). */ - lockdep_assert_not_held(&kvm_lock); + lockdep_assert_not_held(&kvm_usage_lock); lockdep_assert_irqs_disabled(); if (kvm_usage_count) @@ -5740,7 +5741,7 @@ static int kvm_suspend(void) static void kvm_resume(void) { - lockdep_assert_not_held(&kvm_lock); + lockdep_assert_not_held(&kvm_usage_lock); lockdep_assert_irqs_disabled(); if (kvm_usage_count) -- cgit v1.2.3 From 9a798b1337afe4fdcce53efa77953b068d8614f5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 29 Aug 2024 21:35:52 -0700 Subject: KVM: Register cpuhp and syscore callbacks when enabling hardware Register KVM's cpuhp and syscore callback when enabling virtualization in hardware instead of registering the callbacks during initialization, and let the CPU up/down framework invoke the inner enable/disable functions. Registering the callbacks during initialization makes things more complex than they need to be, as KVM needs to be very careful about handling races between enabling CPUs being onlined/offlined and hardware being enabled/disabled. Intel TDX support will require KVM to enable virtualization during KVM initialization, i.e. will add another wrinkle to things, at which point sorting out the potential races with kvm_usage_count would become even more complex. Note, using the cpuhp framework has a subtle behavioral change: enabling will be done serially across all CPUs, whereas KVM currently sends an IPI to all CPUs in parallel. While serializing virtualization enabling could create undesirable latency, the issue is limited to the 0=>1 transition of VM creation. And even that can be mitigated, e.g. by letting userspace force virtualization to be enabled when KVM is initialized. Cc: Chao Gao Reviewed-by: Kai Huang Acked-by: Kai Huang Tested-by: Farrah Chen Signed-off-by: Sean Christopherson Message-ID: <20240830043600.127750-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 174 ++++++++++++++++++---------------------------------- 1 file changed, 61 insertions(+), 113 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7164a9ece208..9ad8903b575a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5578,7 +5578,7 @@ static DEFINE_PER_CPU(bool, hardware_enabled); static DEFINE_MUTEX(kvm_usage_lock); static int kvm_usage_count; -static int __hardware_enable_nolock(void) +static int hardware_enable_nolock(void) { if (__this_cpu_read(hardware_enabled)) return 0; @@ -5593,34 +5593,18 @@ static int __hardware_enable_nolock(void) return 0; } -static void hardware_enable_nolock(void *failed) -{ - if (__hardware_enable_nolock()) - atomic_inc(failed); -} - static int kvm_online_cpu(unsigned int cpu) { - int ret = 0; - /* * Abort the CPU online process if hardware virtualization cannot * be enabled. Otherwise running VMs would encounter unrecoverable * errors when scheduled to this CPU. */ - mutex_lock(&kvm_usage_lock); - if (kvm_usage_count) - ret = __hardware_enable_nolock(); - mutex_unlock(&kvm_usage_lock); - return ret; + return hardware_enable_nolock(); } static void hardware_disable_nolock(void *junk) { - /* - * Note, hardware_disable_all_nolock() tells all online CPUs to disable - * hardware, not just CPUs that successfully enabled hardware! - */ if (!__this_cpu_read(hardware_enabled)) return; @@ -5631,78 +5615,10 @@ static void hardware_disable_nolock(void *junk) static int kvm_offline_cpu(unsigned int cpu) { - mutex_lock(&kvm_usage_lock); - if (kvm_usage_count) - hardware_disable_nolock(NULL); - mutex_unlock(&kvm_usage_lock); + hardware_disable_nolock(NULL); return 0; } -static void hardware_disable_all_nolock(void) -{ - BUG_ON(!kvm_usage_count); - - kvm_usage_count--; - if (!kvm_usage_count) - on_each_cpu(hardware_disable_nolock, NULL, 1); -} - -static void hardware_disable_all(void) -{ - cpus_read_lock(); - mutex_lock(&kvm_usage_lock); - hardware_disable_all_nolock(); - mutex_unlock(&kvm_usage_lock); - cpus_read_unlock(); -} - -static int hardware_enable_all(void) -{ - atomic_t failed = ATOMIC_INIT(0); - int r; - - /* - * Do not enable hardware virtualization if the system is going down. - * If userspace initiated a forced reboot, e.g. reboot -f, then it's - * possible for an in-flight KVM_CREATE_VM to trigger hardware enabling - * after kvm_reboot() is called. Note, this relies on system_state - * being set _before_ kvm_reboot(), which is why KVM uses a syscore ops - * hook instead of registering a dedicated reboot notifier (the latter - * runs before system_state is updated). - */ - if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF || - system_state == SYSTEM_RESTART) - return -EBUSY; - - /* - * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu() - * is called, and so on_each_cpu() between them includes the CPU that - * is being onlined. As a result, hardware_enable_nolock() may get - * invoked before kvm_online_cpu(), which also enables hardware if the - * usage count is non-zero. Disable CPU hotplug to avoid attempting to - * enable hardware multiple times. - */ - cpus_read_lock(); - mutex_lock(&kvm_usage_lock); - - r = 0; - - kvm_usage_count++; - if (kvm_usage_count == 1) { - on_each_cpu(hardware_enable_nolock, &failed, 1); - - if (atomic_read(&failed)) { - hardware_disable_all_nolock(); - r = -EBUSY; - } - } - - mutex_unlock(&kvm_usage_lock); - cpus_read_unlock(); - - return r; -} - static void kvm_shutdown(void) { /* @@ -5734,8 +5650,7 @@ static int kvm_suspend(void) lockdep_assert_not_held(&kvm_usage_lock); lockdep_assert_irqs_disabled(); - if (kvm_usage_count) - hardware_disable_nolock(NULL); + hardware_disable_nolock(NULL); return 0; } @@ -5744,8 +5659,7 @@ static void kvm_resume(void) lockdep_assert_not_held(&kvm_usage_lock); lockdep_assert_irqs_disabled(); - if (kvm_usage_count) - WARN_ON_ONCE(__hardware_enable_nolock()); + WARN_ON_ONCE(hardware_enable_nolock()); } static struct syscore_ops kvm_syscore_ops = { @@ -5753,6 +5667,60 @@ static struct syscore_ops kvm_syscore_ops = { .resume = kvm_resume, .shutdown = kvm_shutdown, }; + +static int hardware_enable_all(void) +{ + int r; + + guard(mutex)(&kvm_usage_lock); + + if (kvm_usage_count++) + return 0; + + r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online", + kvm_online_cpu, kvm_offline_cpu); + if (r) + goto err_cpuhp; + + register_syscore_ops(&kvm_syscore_ops); + + /* + * Undo virtualization enabling and bail if the system is going down. + * If userspace initiated a forced reboot, e.g. reboot -f, then it's + * possible for an in-flight operation to enable virtualization after + * syscore_shutdown() is called, i.e. without kvm_shutdown() being + * invoked. Note, this relies on system_state being set _before_ + * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked + * or this CPU observes the impending shutdown. Which is why KVM uses + * a syscore ops hook instead of registering a dedicated reboot + * notifier (the latter runs before system_state is updated). + */ + if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF || + system_state == SYSTEM_RESTART) { + r = -EBUSY; + goto err_rebooting; + } + + return 0; + +err_rebooting: + unregister_syscore_ops(&kvm_syscore_ops); + cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); +err_cpuhp: + --kvm_usage_count; + return r; +} + +static void hardware_disable_all(void) +{ + guard(mutex)(&kvm_usage_lock); + + if (--kvm_usage_count) + return; + + unregister_syscore_ops(&kvm_syscore_ops); + cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); +} #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ static int hardware_enable_all(void) { @@ -6461,15 +6429,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) int r; int cpu; -#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING - r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online", - kvm_online_cpu, kvm_offline_cpu); - if (r) - return r; - - register_syscore_ops(&kvm_syscore_ops); -#endif - /* A kmem cache lets us meet the alignment requirements of fx_save. */ if (!vcpu_align) vcpu_align = __alignof__(struct kvm_vcpu); @@ -6480,10 +6439,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) offsetofend(struct kvm_vcpu, stats_id) - offsetof(struct kvm_vcpu, arch), NULL); - if (!kvm_vcpu_cache) { - r = -ENOMEM; - goto err_vcpu_cache; - } + if (!kvm_vcpu_cache) + return -ENOMEM; for_each_possible_cpu(cpu) { if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu), @@ -6540,11 +6497,6 @@ err_cpu_kick_mask: for_each_possible_cpu(cpu) free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); -err_vcpu_cache: -#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING - unregister_syscore_ops(&kvm_syscore_ops); - cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); -#endif return r; } EXPORT_SYMBOL_GPL(kvm_init); @@ -6566,10 +6518,6 @@ void kvm_exit(void) kmem_cache_destroy(kvm_vcpu_cache); kvm_vfio_ops_exit(); kvm_async_pf_deinit(); -#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING - unregister_syscore_ops(&kvm_syscore_ops); - cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); -#endif kvm_irqfd_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); -- cgit v1.2.3 From 70c0194337d38dd29533e63e3cb07620f8c5eae1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 29 Aug 2024 21:35:53 -0700 Subject: KVM: Rename symbols related to enabling virtualization hardware Rename the various functions (and a variable) that enable virtualization to prepare for upcoming changes, and to clean up artifacts of KVM's previous behavior, which required manually juggling locks around kvm_usage_count. Drop the "nolock" qualifier from per-CPU functions now that there are no "nolock" implementations of the "all" variants, i.e. now that calling a non-nolock function from a nolock function isn't confusing (unlike this sentence). Drop "all" from the outer helpers as they no longer manually iterate over all CPUs, and because it might not be obvious what "all" refers to. In lieu of the above qualifiers, append "_cpu" to the end of the functions that are per-CPU helpers for the outer APIs. Opportunistically prepend "kvm" to all functions to help make it clear that they are KVM helpers, but mostly because there's no reason not to. Lastly, use "virtualization" instead of "hardware", because while the functions do enable virtualization in hardware, there are a _lot_ of things that KVM enables in hardware. Defer renaming the arch hooks to future patches, purely to reduce the amount of churn in a single commit. Reviewed-by: Chao Gao Reviewed-by: Kai Huang Acked-by: Kai Huang Tested-by: Farrah Chen Signed-off-by: Sean Christopherson Message-ID: <20240830043600.127750-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9ad8903b575a..4efbf9f84149 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -136,8 +136,8 @@ static int kvm_no_compat_open(struct inode *inode, struct file *file) #define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \ .open = kvm_no_compat_open #endif -static int hardware_enable_all(void); -static void hardware_disable_all(void); +static int kvm_enable_virtualization(void); +static void kvm_disable_virtualization(void); static void kvm_io_bus_destroy(struct kvm_io_bus *bus); @@ -1220,7 +1220,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) if (r) goto out_err_no_arch_destroy_vm; - r = hardware_enable_all(); + r = kvm_enable_virtualization(); if (r) goto out_err_no_disable; @@ -1263,7 +1263,7 @@ out_no_coalesced_mmio: mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); #endif out_err_no_mmu_notifier: - hardware_disable_all(); + kvm_disable_virtualization(); out_err_no_disable: kvm_arch_destroy_vm(kvm); out_err_no_arch_destroy_vm: @@ -1360,7 +1360,7 @@ static void kvm_destroy_vm(struct kvm *kvm) #endif kvm_arch_free_vm(kvm); preempt_notifier_dec(); - hardware_disable_all(); + kvm_disable_virtualization(); mmdrop(mm); } @@ -5574,13 +5574,13 @@ static struct miscdevice kvm_dev = { __visible bool kvm_rebooting; EXPORT_SYMBOL_GPL(kvm_rebooting); -static DEFINE_PER_CPU(bool, hardware_enabled); +static DEFINE_PER_CPU(bool, virtualization_enabled); static DEFINE_MUTEX(kvm_usage_lock); static int kvm_usage_count; -static int hardware_enable_nolock(void) +static int kvm_enable_virtualization_cpu(void) { - if (__this_cpu_read(hardware_enabled)) + if (__this_cpu_read(virtualization_enabled)) return 0; if (kvm_arch_hardware_enable()) { @@ -5589,7 +5589,7 @@ static int hardware_enable_nolock(void) return -EIO; } - __this_cpu_write(hardware_enabled, true); + __this_cpu_write(virtualization_enabled, true); return 0; } @@ -5600,22 +5600,22 @@ static int kvm_online_cpu(unsigned int cpu) * be enabled. Otherwise running VMs would encounter unrecoverable * errors when scheduled to this CPU. */ - return hardware_enable_nolock(); + return kvm_enable_virtualization_cpu(); } -static void hardware_disable_nolock(void *junk) +static void kvm_disable_virtualization_cpu(void *ign) { - if (!__this_cpu_read(hardware_enabled)) + if (!__this_cpu_read(virtualization_enabled)) return; kvm_arch_hardware_disable(); - __this_cpu_write(hardware_enabled, false); + __this_cpu_write(virtualization_enabled, false); } static int kvm_offline_cpu(unsigned int cpu) { - hardware_disable_nolock(NULL); + kvm_disable_virtualization_cpu(NULL); return 0; } @@ -5634,7 +5634,7 @@ static void kvm_shutdown(void) */ pr_info("kvm: exiting hardware virtualization\n"); kvm_rebooting = true; - on_each_cpu(hardware_disable_nolock, NULL, 1); + on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1); } static int kvm_suspend(void) @@ -5650,7 +5650,7 @@ static int kvm_suspend(void) lockdep_assert_not_held(&kvm_usage_lock); lockdep_assert_irqs_disabled(); - hardware_disable_nolock(NULL); + kvm_disable_virtualization_cpu(NULL); return 0; } @@ -5659,7 +5659,7 @@ static void kvm_resume(void) lockdep_assert_not_held(&kvm_usage_lock); lockdep_assert_irqs_disabled(); - WARN_ON_ONCE(hardware_enable_nolock()); + WARN_ON_ONCE(kvm_enable_virtualization_cpu()); } static struct syscore_ops kvm_syscore_ops = { @@ -5668,7 +5668,7 @@ static struct syscore_ops kvm_syscore_ops = { .shutdown = kvm_shutdown, }; -static int hardware_enable_all(void) +static int kvm_enable_virtualization(void) { int r; @@ -5711,7 +5711,7 @@ err_cpuhp: return r; } -static void hardware_disable_all(void) +static void kvm_disable_virtualization(void) { guard(mutex)(&kvm_usage_lock); @@ -5722,12 +5722,12 @@ static void hardware_disable_all(void) cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); } #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ -static int hardware_enable_all(void) +static int kvm_enable_virtualization(void) { return 0; } -static void hardware_disable_all(void) +static void kvm_disable_virtualization(void) { } -- cgit v1.2.3 From 071f24ad28cdcdfa544fdb79b1b1d2b423717a11 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 29 Aug 2024 21:35:54 -0700 Subject: KVM: Rename arch hooks related to per-CPU virtualization enabling Rename the per-CPU hooks used to enable virtualization in hardware to align with the KVM-wide helpers in kvm_main.c, and to better capture that the callbacks are invoked on every online CPU. No functional change intended. Suggested-by: Paolo Bonzini Signed-off-by: Sean Christopherson Reviewed-by: Kai Huang Message-ID: <20240830043600.127750-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4efbf9f84149..7ada2b669d51 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5583,7 +5583,7 @@ static int kvm_enable_virtualization_cpu(void) if (__this_cpu_read(virtualization_enabled)) return 0; - if (kvm_arch_hardware_enable()) { + if (kvm_arch_enable_virtualization_cpu()) { pr_info("kvm: enabling virtualization on CPU%d failed\n", raw_smp_processor_id()); return -EIO; @@ -5608,7 +5608,7 @@ static void kvm_disable_virtualization_cpu(void *ign) if (!__this_cpu_read(virtualization_enabled)) return; - kvm_arch_hardware_disable(); + kvm_arch_disable_virtualization_cpu(); __this_cpu_write(virtualization_enabled, false); } -- cgit v1.2.3 From b4886fab6fb620b96ad7eeefb9801c42dfa91741 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 29 Aug 2024 21:35:57 -0700 Subject: KVM: Add a module param to allow enabling virtualization when KVM is loaded Add an on-by-default module param, enable_virt_at_load, to let userspace force virtualization to be enabled in hardware when KVM is initialized, i.e. just before /dev/kvm is exposed to userspace. Enabling virtualization during KVM initialization allows userspace to avoid the additional latency when creating/destroying the first/last VM (or more specifically, on the 0=>1 and 1=>0 edges of creation/destruction). Now that KVM uses the cpuhp framework to do per-CPU enabling, the latency could be non-trivial as the cpuhup bringup/teardown is serialized across CPUs, e.g. the latency could be problematic for use case that need to spin up VMs quickly. Prior to commit 10474ae8945c ("KVM: Activate Virtualization On Demand"), KVM _unconditionally_ enabled virtualization during load, i.e. there's no fundamental reason KVM needs to dynamically toggle virtualization. These days, the only known argument for not enabling virtualization is to allow KVM to be autoloaded without blocking other out-of-tree hypervisors, and such use cases can simply change the module param, e.g. via command line. Note, the aforementioned commit also mentioned that enabling SVM (AMD's virtualization extensions) can result in "using invalid TLB entries". It's not clear whether the changelog was referring to a KVM bug, a CPU bug, or something else entirely. Regardless, leaving virtualization off by default is not a robust "fix", as any protection provided is lost the instant userspace creates the first VM. Reviewed-by: Chao Gao Acked-by: Kai Huang Reviewed-by: Kai Huang Tested-by: Farrah Chen Signed-off-by: Sean Christopherson Message-ID: <20240830043600.127750-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7ada2b669d51..3a18da68b0ca 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5571,6 +5571,9 @@ static struct miscdevice kvm_dev = { }; #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING +static bool enable_virt_at_load = true; +module_param(enable_virt_at_load, bool, 0444); + __visible bool kvm_rebooting; EXPORT_SYMBOL_GPL(kvm_rebooting); @@ -5721,15 +5724,39 @@ static void kvm_disable_virtualization(void) unregister_syscore_ops(&kvm_syscore_ops); cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); } + +static int kvm_init_virtualization(void) +{ + if (enable_virt_at_load) + return kvm_enable_virtualization(); + + return 0; +} + +static void kvm_uninit_virtualization(void) +{ + if (enable_virt_at_load) + kvm_disable_virtualization(); +} #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ static int kvm_enable_virtualization(void) { return 0; } +static int kvm_init_virtualization(void) +{ + return 0; +} + static void kvm_disable_virtualization(void) { +} + +static void kvm_uninit_virtualization(void) +{ + } #endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ @@ -6474,6 +6501,10 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) kvm_gmem_init(module); + r = kvm_init_virtualization(); + if (r) + goto err_virt; + /* * Registration _must_ be the very last thing done, as this exposes * /dev/kvm to userspace, i.e. all infrastructure must be setup! @@ -6487,6 +6518,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) return 0; err_register: + kvm_uninit_virtualization(); +err_virt: kvm_vfio_ops_exit(); err_vfio: kvm_async_pf_deinit(); @@ -6512,6 +6545,8 @@ void kvm_exit(void) */ misc_deregister(&kvm_dev); + kvm_uninit_virtualization(); + debugfs_remove_recursive(kvm_debugfs_dir); for_each_possible_cpu(cpu) free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); -- cgit v1.2.3 From b67107a251b01161956892352d53fb122346eda1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 29 Aug 2024 21:35:58 -0700 Subject: KVM: Add arch hooks for enabling/disabling virtualization Add arch hooks that are invoked when KVM enables/disable virtualization. x86 will use the hooks to register an "emergency disable" callback, which is essentially an x86-specific shutdown notifier that is used when the kernel is doing an emergency reboot/shutdown/kexec. Add comments for the declarations to help arch code understand exactly when the callbacks are invoked. Alternatively, the APIs themselves could communicate most of the same info, but kvm_arch_pre_enable_virtualization() and kvm_arch_post_disable_virtualization() are a bit cumbersome, and make it a bit less obvious that they are intended to be implemented as a pair. Reviewed-by: Chao Gao Reviewed-by: Kai Huang Acked-by: Kai Huang Tested-by: Farrah Chen Signed-off-by: Sean Christopherson Message-ID: <20240830043600.127750-9-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3a18da68b0ca..5f9a66df7bfb 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5581,6 +5581,16 @@ static DEFINE_PER_CPU(bool, virtualization_enabled); static DEFINE_MUTEX(kvm_usage_lock); static int kvm_usage_count; +__weak void kvm_arch_enable_virtualization(void) +{ + +} + +__weak void kvm_arch_disable_virtualization(void) +{ + +} + static int kvm_enable_virtualization_cpu(void) { if (__this_cpu_read(virtualization_enabled)) @@ -5680,6 +5690,8 @@ static int kvm_enable_virtualization(void) if (kvm_usage_count++) return 0; + kvm_arch_enable_virtualization(); + r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online", kvm_online_cpu, kvm_offline_cpu); if (r) @@ -5710,6 +5722,7 @@ err_rebooting: unregister_syscore_ops(&kvm_syscore_ops); cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); err_cpuhp: + kvm_arch_disable_virtualization(); --kvm_usage_count; return r; } @@ -5723,6 +5736,7 @@ static void kvm_disable_virtualization(void) unregister_syscore_ops(&kvm_syscore_ops); cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); + kvm_arch_disable_virtualization(); } static int kvm_init_virtualization(void) -- cgit v1.2.3