diff options
| author | Paolo Bonzini <pbonzini@redhat.com> | 2025-07-28 11:03:04 -0400 |
|---|---|---|
| committer | Paolo Bonzini <pbonzini@redhat.com> | 2025-07-29 08:35:46 -0400 |
| commit | f02b1bcc73a17602903480562571069f0dff9f24 (patch) | |
| tree | 325828b3a747b77a069cde7e0df2f949152ac081 /virt | |
| parent | 65164fd0f6b50781fe27736be54e55535c9ad82d (diff) | |
| parent | 81bf24f1ac77029bf858c0da081088eb62b1b230 (diff) | |
| download | linux-f02b1bcc73a17602903480562571069f0dff9f24.tar.gz linux-f02b1bcc73a17602903480562571069f0dff9f24.tar.bz2 linux-f02b1bcc73a17602903480562571069f0dff9f24.zip | |
Merge tag 'kvm-x86-irqs-6.17' of https://github.com/kvm-x86/linux into HEAD
KVM IRQ changes for 6.17
- Rework irqbypass to track/match producers and consumers via an xarray
instead of a linked list. Using a linked list leads to O(n^2) insertion
times, which is hugely problematic for use cases that create large numbers
of VMs. Such use cases typically don't actually use irqbypass, but
eliminating the pointless registration is a future problem to solve as it
likely requires new uAPI.
- Track irqbypass's "token" as "struct eventfd_ctx *" instead of a "void *",
to avoid making a simple concept unnecessarily difficult to understand.
- Add CONFIG_KVM_IOAPIC for x86 to allow disabling support for I/O APIC, PIC,
and PIT emulation at compile time.
- Drop x86's irq_comm.c, and move a pile of IRQ related code into irq.c.
- Fix a variety of flaws and bugs in the AVIC device posted IRQ code.
- Inhibited AVIC if a vCPU's ID is too big (relative to what hardware
supports) instead of rejecting vCPU creation.
- Extend enable_ipiv module param support to SVM, by simply leaving IsRunning
clear in the vCPU's physical ID table entry.
- Disable IPI virtualization, via enable_ipiv, if the CPU is affected by
erratum #1235, to allow (safely) enabling AVIC on such CPUs.
- Dedup x86's device posted IRQ code, as the vast majority of functionality
can be shared verbatime between SVM and VMX.
- Harden the device posted IRQ code against bugs and runtime errors.
- Use vcpu_idx, not vcpu_id, for GA log tag/metadata, to make lookups O(1)
instead of O(n).
- Generate GA Log interrupts if and only if the target vCPU is blocking, i.e.
only if KVM needs a notification in order to wake the vCPU.
- Decouple device posted IRQs from VFIO device assignment, as binding a VM to
a VFIO group is not a requirement for enabling device posted IRQs.
- Clean up and document/comment the irqfd assignment code.
- Disallow binding multiple irqfds to an eventfd with a priority waiter, i.e.
ensure an eventfd is bound to at most one irqfd through the entire host,
and add a selftest to verify eventfd:irqfd bindings are globally unique.
Diffstat (limited to 'virt')
| -rw-r--r-- | virt/kvm/eventfd.c | 159 | ||||
| -rw-r--r-- | virt/kvm/irqchip.c | 2 | ||||
| -rw-r--r-- | virt/lib/irqbypass.c | 190 |
3 files changed, 171 insertions, 180 deletions
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 11e5d1e3f12e..6b1133a6617f 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -204,6 +204,11 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) int ret = 0; if (flags & EPOLLIN) { + /* + * WARNING: Do NOT take irqfds.lock in any path except EPOLLHUP, + * as KVM holds irqfds.lock when registering the irqfd with the + * eventfd. + */ u64 cnt; eventfd_ctx_do_read(irqfd->eventfd, &cnt); @@ -225,6 +230,11 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) /* The eventfd is closing, detach from KVM */ unsigned long iflags; + /* + * Taking irqfds.lock is safe here, as KVM holds a reference to + * the eventfd when registering the irqfd, i.e. this path can't + * be reached while kvm_irqfd_add() is running. + */ spin_lock_irqsave(&kvm->irqfds.lock, iflags); /* @@ -245,22 +255,14 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) return ret; } -static void -irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, - poll_table *pt) -{ - struct kvm_kernel_irqfd *irqfd = - container_of(pt, struct kvm_kernel_irqfd, pt); - add_wait_queue_priority(wqh, &irqfd->wait); -} - -/* Must be called under irqfds.lock */ static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd) { struct kvm_kernel_irq_routing_entry *e; struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; int n_entries; + lockdep_assert_held(&kvm->irqfds.lock); + n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi); write_seqcount_begin(&irqfd->irq_entry_sc); @@ -274,6 +276,63 @@ static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd) write_seqcount_end(&irqfd->irq_entry_sc); } +struct kvm_irqfd_pt { + struct kvm_kernel_irqfd *irqfd; + struct kvm *kvm; + poll_table pt; + int ret; +}; + +static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, + poll_table *pt) +{ + struct kvm_irqfd_pt *p = container_of(pt, struct kvm_irqfd_pt, pt); + struct kvm_kernel_irqfd *irqfd = p->irqfd; + struct kvm *kvm = p->kvm; + + /* + * Note, irqfds.lock protects the irqfd's irq_entry, i.e. its routing, + * and irqfds.items. It does NOT protect registering with the eventfd. + */ + spin_lock_irq(&kvm->irqfds.lock); + + /* + * Initialize the routing information prior to adding the irqfd to the + * eventfd's waitqueue, as irqfd_wakeup() can be invoked as soon as the + * irqfd is registered. + */ + irqfd_update(kvm, irqfd); + + /* + * Add the irqfd as a priority waiter on the eventfd, with a custom + * wake-up handler, so that KVM *and only KVM* is notified whenever the + * underlying eventfd is signaled. + */ + init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); + + /* + * Temporarily lie to lockdep about holding irqfds.lock to avoid a + * false positive regarding potential deadlock with irqfd_wakeup() + * (see irqfd_wakeup() for details). + * + * Adding to the wait queue will fail if there is already a priority + * waiter, i.e. if the eventfd is associated with another irqfd (in any + * VM). Note, kvm_irqfd_deassign() waits for all in-flight shutdown + * jobs to complete, i.e. ensures the irqfd has been removed from the + * eventfd's waitqueue before returning to userspace. + */ + spin_release(&kvm->irqfds.lock.dep_map, _RET_IP_); + p->ret = add_wait_queue_priority_exclusive(wqh, &irqfd->wait); + spin_acquire(&kvm->irqfds.lock.dep_map, 0, 0, _RET_IP_); + if (p->ret) + goto out; + + list_add_tail(&irqfd->list, &kvm->irqfds.items); + +out: + spin_unlock_irq(&kvm->irqfds.lock); +} + #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) void __attribute__((weak)) kvm_arch_irq_bypass_stop( struct irq_bypass_consumer *cons) @@ -285,26 +344,20 @@ void __attribute__((weak)) kvm_arch_irq_bypass_start( { } -int __attribute__((weak)) kvm_arch_update_irqfd_routing( - struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) +void __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) { - return 0; -} -bool __attribute__((weak)) kvm_arch_irqfd_route_changed( - struct kvm_kernel_irq_routing_entry *old, - struct kvm_kernel_irq_routing_entry *new) -{ - return true; } #endif static int kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) { - struct kvm_kernel_irqfd *irqfd, *tmp; + struct kvm_kernel_irqfd *irqfd; struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; + struct kvm_irqfd_pt irqfd_pt; int ret; __poll_t events; int idx; @@ -390,57 +443,54 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) } /* - * Install our own custom wake-up handling so we are notified via - * a callback whenever someone signals the underlying eventfd + * Set the irqfd routing and add it to KVM's list before registering + * the irqfd with the eventfd, so that the routing information is valid + * and stays valid, e.g. if there are GSI routing changes, prior to + * making the irqfd visible, i.e. before it might be signaled. + * + * Note, holding SRCU ensures a stable read of routing information, and + * also prevents irqfd_shutdown() from freeing the irqfd before it's + * fully initialized. */ - init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); - init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc); - - spin_lock_irq(&kvm->irqfds.lock); - - ret = 0; - list_for_each_entry(tmp, &kvm->irqfds.items, list) { - if (irqfd->eventfd != tmp->eventfd) - continue; - /* This fd is used for another irq already. */ - ret = -EBUSY; - spin_unlock_irq(&kvm->irqfds.lock); - goto fail; - } - idx = srcu_read_lock(&kvm->irq_srcu); - irqfd_update(kvm, irqfd); - - list_add_tail(&irqfd->list, &kvm->irqfds.items); - - spin_unlock_irq(&kvm->irqfds.lock); /* - * Check if there was an event already pending on the eventfd - * before we registered, and trigger it as if we didn't miss it. + * Register the irqfd with the eventfd by polling on the eventfd, and + * simultaneously and the irqfd to KVM's list. If there was en event + * pending on the eventfd prior to registering, manually trigger IRQ + * injection. */ - events = vfs_poll(fd_file(f), &irqfd->pt); + irqfd_pt.irqfd = irqfd; + irqfd_pt.kvm = kvm; + init_poll_funcptr(&irqfd_pt.pt, kvm_irqfd_register); + + events = vfs_poll(fd_file(f), &irqfd_pt.pt); + + ret = irqfd_pt.ret; + if (ret) + goto fail_poll; if (events & EPOLLIN) schedule_work(&irqfd->inject); #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) if (kvm_arch_has_irq_bypass()) { - irqfd->consumer.token = (void *)irqfd->eventfd; irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer; irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer; irqfd->consumer.stop = kvm_arch_irq_bypass_stop; irqfd->consumer.start = kvm_arch_irq_bypass_start; - ret = irq_bypass_register_consumer(&irqfd->consumer); + ret = irq_bypass_register_consumer(&irqfd->consumer, irqfd->eventfd); if (ret) - pr_info("irq bypass consumer (token %p) registration fails: %d\n", - irqfd->consumer.token, ret); + pr_info("irq bypass consumer (eventfd %p) registration fails: %d\n", + irqfd->eventfd, ret); } #endif srcu_read_unlock(&kvm->irq_srcu, idx); return 0; +fail_poll: + srcu_read_unlock(&kvm->irq_srcu, idx); fail: if (irqfd->resampler) irqfd_resampler_shutdown(irqfd); @@ -617,13 +667,8 @@ void kvm_irq_routing_update(struct kvm *kvm) irqfd_update(kvm, irqfd); #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) - if (irqfd->producer && - kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) { - int ret = kvm_arch_update_irqfd_routing( - irqfd->kvm, irqfd->producer->irq, - irqfd->gsi, 1); - WARN_ON(ret); - } + if (irqfd->producer) + kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry); #endif } diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index 162d8ed889f2..6ccabfd32287 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -222,8 +222,6 @@ int kvm_set_irq_routing(struct kvm *kvm, kvm_arch_irq_routing_update(kvm); mutex_unlock(&kvm->irq_lock); - kvm_arch_post_irq_routing_update(kvm); - synchronize_srcu_expedited(&kvm->irq_srcu); new = old; diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index 28fda42e471b..62c160200be9 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -22,8 +22,8 @@ MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("IRQ bypass manager utility module"); -static LIST_HEAD(producers); -static LIST_HEAD(consumers); +static DEFINE_XARRAY(producers); +static DEFINE_XARRAY(consumers); static DEFINE_MUTEX(lock); /* @lock must be held when calling connect */ @@ -51,6 +51,10 @@ static int __connect(struct irq_bypass_producer *prod, if (prod->start) prod->start(prod); + if (!ret) { + prod->consumer = cons; + cons->producer = prod; + } return ret; } @@ -72,56 +76,49 @@ static void __disconnect(struct irq_bypass_producer *prod, cons->start(cons); if (prod->start) prod->start(prod); + + prod->consumer = NULL; + cons->producer = NULL; } /** * irq_bypass_register_producer - register IRQ bypass producer * @producer: pointer to producer structure + * @eventfd: pointer to the eventfd context associated with the producer + * @irq: Linux IRQ number of the underlying producer device * - * Add the provided IRQ producer to the list of producers and connect - * with any matching token found on the IRQ consumers list. + * Add the provided IRQ producer to the set of producers and connect with the + * consumer with a matching eventfd, if one exists. */ -int irq_bypass_register_producer(struct irq_bypass_producer *producer) +int irq_bypass_register_producer(struct irq_bypass_producer *producer, + struct eventfd_ctx *eventfd, int irq) { - struct irq_bypass_producer *tmp; + unsigned long index = (unsigned long)eventfd; struct irq_bypass_consumer *consumer; int ret; - if (!producer->token) + if (WARN_ON_ONCE(producer->eventfd)) return -EINVAL; - might_sleep(); - - if (!try_module_get(THIS_MODULE)) - return -ENODEV; + producer->irq = irq; - mutex_lock(&lock); + guard(mutex)(&lock); - list_for_each_entry(tmp, &producers, node) { - if (tmp->token == producer->token) { - ret = -EBUSY; - goto out_err; - } - } + ret = xa_insert(&producers, index, producer, GFP_KERNEL); + if (ret) + return ret; - list_for_each_entry(consumer, &consumers, node) { - if (consumer->token == producer->token) { - ret = __connect(producer, consumer); - if (ret) - goto out_err; - break; + consumer = xa_load(&consumers, index); + if (consumer) { + ret = __connect(producer, consumer); + if (ret) { + WARN_ON_ONCE(xa_erase(&producers, index) != producer); + return ret; } } - list_add(&producer->node, &producers); - - mutex_unlock(&lock); - + producer->eventfd = eventfd; return 0; -out_err: - mutex_unlock(&lock); - module_put(THIS_MODULE); - return ret; } EXPORT_SYMBOL_GPL(irq_bypass_register_producer); @@ -129,95 +126,65 @@ EXPORT_SYMBOL_GPL(irq_bypass_register_producer); * irq_bypass_unregister_producer - unregister IRQ bypass producer * @producer: pointer to producer structure * - * Remove a previously registered IRQ producer from the list of producers - * and disconnect it from any connected IRQ consumer. + * Remove a previously registered IRQ producer (note, it's safe to call this + * even if registration was unsuccessful). Disconnect from the associated + * consumer, if one exists. */ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) { - struct irq_bypass_producer *tmp; - struct irq_bypass_consumer *consumer; + unsigned long index = (unsigned long)producer->eventfd; - if (!producer->token) + if (!producer->eventfd) return; - might_sleep(); - - if (!try_module_get(THIS_MODULE)) - return; /* nothing in the list anyway */ - - mutex_lock(&lock); - - list_for_each_entry(tmp, &producers, node) { - if (tmp->token != producer->token) - continue; - - list_for_each_entry(consumer, &consumers, node) { - if (consumer->token == producer->token) { - __disconnect(producer, consumer); - break; - } - } - - list_del(&producer->node); - module_put(THIS_MODULE); - break; - } + guard(mutex)(&lock); - mutex_unlock(&lock); + if (producer->consumer) + __disconnect(producer, producer->consumer); - module_put(THIS_MODULE); + WARN_ON_ONCE(xa_erase(&producers, index) != producer); + producer->eventfd = NULL; } EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer); /** * irq_bypass_register_consumer - register IRQ bypass consumer * @consumer: pointer to consumer structure + * @eventfd: pointer to the eventfd context associated with the consumer * - * Add the provided IRQ consumer to the list of consumers and connect - * with any matching token found on the IRQ producer list. + * Add the provided IRQ consumer to the set of consumers and connect with the + * producer with a matching eventfd, if one exists. */ -int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer) +int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer, + struct eventfd_ctx *eventfd) { - struct irq_bypass_consumer *tmp; + unsigned long index = (unsigned long)eventfd; struct irq_bypass_producer *producer; int ret; - if (!consumer->token || - !consumer->add_producer || !consumer->del_producer) + if (WARN_ON_ONCE(consumer->eventfd)) return -EINVAL; - might_sleep(); - - if (!try_module_get(THIS_MODULE)) - return -ENODEV; + if (!consumer->add_producer || !consumer->del_producer) + return -EINVAL; - mutex_lock(&lock); + guard(mutex)(&lock); - list_for_each_entry(tmp, &consumers, node) { - if (tmp->token == consumer->token || tmp == consumer) { - ret = -EBUSY; - goto out_err; - } - } + ret = xa_insert(&consumers, index, consumer, GFP_KERNEL); + if (ret) + return ret; - list_for_each_entry(producer, &producers, node) { - if (producer->token == consumer->token) { - ret = __connect(producer, consumer); - if (ret) - goto out_err; - break; + producer = xa_load(&producers, index); + if (producer) { + ret = __connect(producer, consumer); + if (ret) { + WARN_ON_ONCE(xa_erase(&consumers, index) != consumer); + return ret; } } - list_add(&consumer->node, &consumers); - - mutex_unlock(&lock); - + consumer->eventfd = eventfd; return 0; -out_err: - mutex_unlock(&lock); - module_put(THIS_MODULE); - return ret; } EXPORT_SYMBOL_GPL(irq_bypass_register_consumer); @@ -225,42 +192,23 @@ EXPORT_SYMBOL_GPL(irq_bypass_register_consumer); * irq_bypass_unregister_consumer - unregister IRQ bypass consumer * @consumer: pointer to consumer structure * - * Remove a previously registered IRQ consumer from the list of consumers - * and disconnect it from any connected IRQ producer. + * Remove a previously registered IRQ consumer (note, it's safe to call this + * even if registration was unsuccessful). Disconnect from the associated + * producer, if one exists. */ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) { - struct irq_bypass_consumer *tmp; - struct irq_bypass_producer *producer; + unsigned long index = (unsigned long)consumer->eventfd; - if (!consumer->token) + if (!consumer->eventfd) return; - might_sleep(); - - if (!try_module_get(THIS_MODULE)) - return; /* nothing in the list anyway */ - - mutex_lock(&lock); - - list_for_each_entry(tmp, &consumers, node) { - if (tmp != consumer) - continue; - - list_for_each_entry(producer, &producers, node) { - if (producer->token == consumer->token) { - __disconnect(producer, consumer); - break; - } - } - - list_del(&consumer->node); - module_put(THIS_MODULE); - break; - } + guard(mutex)(&lock); - mutex_unlock(&lock); + if (consumer->producer) + __disconnect(consumer->producer, consumer); - module_put(THIS_MODULE); + WARN_ON_ONCE(xa_erase(&consumers, index) != consumer); + consumer->eventfd = NULL; } EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer); |
