From fa079a0616edbcdad538128306abbc19b68a9863 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 May 2025 16:07:27 -0700 Subject: irqbypass: Drop pointless and misleading THIS_MODULE get/put Drop irqbypass.ko's superfluous and misleading get/put calls on THIS_MODULE. A module taking a reference to itself is useless; no amount of checks will prevent doom and destruction if the caller hasn't already guaranteed the liveliness of the module (this goes for any module). E.g. if try_module_get() fails because irqbypass.ko is being unloaded, then the kernel has already hit a use-after-free by virtue of executing code whose lifecycle is tied to irqbypass.ko. Reviewed-by: Kevin Tian Acked-by: Michael S. Tsirkin Reviewed-by: Alex Williamson Link: https://lore.kernel.org/r/20250516230734.2564775-2-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/lib/irqbypass.c | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'virt') diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index 28fda42e471b..080c706f3b01 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -92,9 +92,6 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer) might_sleep(); - if (!try_module_get(THIS_MODULE)) - return -ENODEV; - mutex_lock(&lock); list_for_each_entry(tmp, &producers, node) { @@ -120,7 +117,6 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer) return 0; out_err: mutex_unlock(&lock); - module_put(THIS_MODULE); return ret; } EXPORT_SYMBOL_GPL(irq_bypass_register_producer); @@ -142,9 +138,6 @@ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) might_sleep(); - if (!try_module_get(THIS_MODULE)) - return; /* nothing in the list anyway */ - mutex_lock(&lock); list_for_each_entry(tmp, &producers, node) { @@ -159,13 +152,10 @@ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) } list_del(&producer->node); - module_put(THIS_MODULE); break; } mutex_unlock(&lock); - - module_put(THIS_MODULE); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer); @@ -188,9 +178,6 @@ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer) might_sleep(); - if (!try_module_get(THIS_MODULE)) - return -ENODEV; - mutex_lock(&lock); list_for_each_entry(tmp, &consumers, node) { @@ -216,7 +203,6 @@ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer) return 0; out_err: mutex_unlock(&lock); - module_put(THIS_MODULE); return ret; } EXPORT_SYMBOL_GPL(irq_bypass_register_consumer); @@ -238,9 +224,6 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) might_sleep(); - if (!try_module_get(THIS_MODULE)) - return; /* nothing in the list anyway */ - mutex_lock(&lock); list_for_each_entry(tmp, &consumers, node) { @@ -255,12 +238,9 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) } list_del(&consumer->node); - module_put(THIS_MODULE); break; } mutex_unlock(&lock); - - module_put(THIS_MODULE); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer); -- cgit v1.2.3 From 07fbc83c01520c62c89f6495f2f0bea2f4ac6684 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 May 2025 16:07:28 -0700 Subject: irqbypass: Drop superfluous might_sleep() annotations Drop superfluous might_sleep() annotations from irqbypass, mutex_lock() provides all of the necessary tracking. Reviewed-by: Kevin Tian Acked-by: Michael S. Tsirkin Reviewed-by: Alex Williamson Link: https://lore.kernel.org/r/20250516230734.2564775-3-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/lib/irqbypass.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'virt') diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index 080c706f3b01..28a4d933569a 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -90,8 +90,6 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer) if (!producer->token) return -EINVAL; - might_sleep(); - mutex_lock(&lock); list_for_each_entry(tmp, &producers, node) { @@ -136,8 +134,6 @@ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) if (!producer->token) return; - might_sleep(); - mutex_lock(&lock); list_for_each_entry(tmp, &producers, node) { @@ -176,8 +172,6 @@ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer) !consumer->add_producer || !consumer->del_producer) return -EINVAL; - might_sleep(); - mutex_lock(&lock); list_for_each_entry(tmp, &consumers, node) { @@ -222,8 +216,6 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) if (!consumer->token) return; - might_sleep(); - mutex_lock(&lock); list_for_each_entry(tmp, &consumers, node) { -- cgit v1.2.3 From 2b521d86ee80a436a92445b8206d38d75aeb39ea Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 May 2025 16:07:29 -0700 Subject: irqbypass: Take ownership of producer/consumer token tracking Move ownership of IRQ bypass token tracking into irqbypass.ko, and explicitly require callers to pass an eventfd_ctx structure instead of a completely opaque token. Relying on producers and consumers to set the token appropriately is error prone, and hiding the fact that the token must be an eventfd_ctx pointer (for all intents and purposes) unnecessarily obfuscates the code and makes it more brittle. Reviewed-by: Kevin Tian Acked-by: Michael S. Tsirkin Reviewed-by: Alex Williamson Link: https://lore.kernel.org/r/20250516230734.2564775-4-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/eventfd.c | 7 +++---- virt/lib/irqbypass.c | 44 ++++++++++++++++++++++++++++---------------- 2 files changed, 31 insertions(+), 20 deletions(-) (limited to 'virt') diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 11e5d1e3f12e..5bc6abe30748 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -426,15 +426,14 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) if (kvm_arch_has_irq_bypass()) { - irqfd->consumer.token = (void *)irqfd->eventfd; irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer; irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer; irqfd->consumer.stop = kvm_arch_irq_bypass_stop; irqfd->consumer.start = kvm_arch_irq_bypass_start; - ret = irq_bypass_register_consumer(&irqfd->consumer); + ret = irq_bypass_register_consumer(&irqfd->consumer, irqfd->eventfd); if (ret) - pr_info("irq bypass consumer (token %p) registration fails: %d\n", - irqfd->consumer.token, ret); + pr_info("irq bypass consumer (eventfd %p) registration fails: %d\n", + irqfd->eventfd, ret); } #endif diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index 28a4d933569a..e8d7c420db52 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -77,30 +77,32 @@ static void __disconnect(struct irq_bypass_producer *prod, /** * irq_bypass_register_producer - register IRQ bypass producer * @producer: pointer to producer structure + * @eventfd: pointer to the eventfd context associated with the producer * * Add the provided IRQ producer to the list of producers and connect - * with any matching token found on the IRQ consumers list. + * with any matching eventfd found on the IRQ consumers list. */ -int irq_bypass_register_producer(struct irq_bypass_producer *producer) +int irq_bypass_register_producer(struct irq_bypass_producer *producer, + struct eventfd_ctx *eventfd) { struct irq_bypass_producer *tmp; struct irq_bypass_consumer *consumer; int ret; - if (!producer->token) + if (WARN_ON_ONCE(producer->eventfd)) return -EINVAL; mutex_lock(&lock); list_for_each_entry(tmp, &producers, node) { - if (tmp->token == producer->token) { + if (tmp->eventfd == eventfd) { ret = -EBUSY; goto out_err; } } list_for_each_entry(consumer, &consumers, node) { - if (consumer->token == producer->token) { + if (consumer->eventfd == eventfd) { ret = __connect(producer, consumer); if (ret) goto out_err; @@ -108,6 +110,7 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer) } } + producer->eventfd = eventfd; list_add(&producer->node, &producers); mutex_unlock(&lock); @@ -131,26 +134,28 @@ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) struct irq_bypass_producer *tmp; struct irq_bypass_consumer *consumer; - if (!producer->token) + if (!producer->eventfd) return; mutex_lock(&lock); list_for_each_entry(tmp, &producers, node) { - if (tmp->token != producer->token) + if (tmp->eventfd != producer->eventfd) continue; list_for_each_entry(consumer, &consumers, node) { - if (consumer->token == producer->token) { + if (consumer->eventfd == producer->eventfd) { __disconnect(producer, consumer); break; } } + producer->eventfd = NULL; list_del(&producer->node); break; } + WARN_ON_ONCE(producer->eventfd); mutex_unlock(&lock); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer); @@ -158,31 +163,35 @@ EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer); /** * irq_bypass_register_consumer - register IRQ bypass consumer * @consumer: pointer to consumer structure + * @eventfd: pointer to the eventfd context associated with the consumer * * Add the provided IRQ consumer to the list of consumers and connect - * with any matching token found on the IRQ producer list. + * with any matching eventfd found on the IRQ producer list. */ -int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer) +int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer, + struct eventfd_ctx *eventfd) { struct irq_bypass_consumer *tmp; struct irq_bypass_producer *producer; int ret; - if (!consumer->token || - !consumer->add_producer || !consumer->del_producer) + if (WARN_ON_ONCE(consumer->eventfd)) + return -EINVAL; + + if (!consumer->add_producer || !consumer->del_producer) return -EINVAL; mutex_lock(&lock); list_for_each_entry(tmp, &consumers, node) { - if (tmp->token == consumer->token || tmp == consumer) { + if (tmp->eventfd == eventfd) { ret = -EBUSY; goto out_err; } } list_for_each_entry(producer, &producers, node) { - if (producer->token == consumer->token) { + if (producer->eventfd == eventfd) { ret = __connect(producer, consumer); if (ret) goto out_err; @@ -190,6 +199,7 @@ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer) } } + consumer->eventfd = eventfd; list_add(&consumer->node, &consumers); mutex_unlock(&lock); @@ -213,7 +223,7 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) struct irq_bypass_consumer *tmp; struct irq_bypass_producer *producer; - if (!consumer->token) + if (!consumer->eventfd) return; mutex_lock(&lock); @@ -223,16 +233,18 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) continue; list_for_each_entry(producer, &producers, node) { - if (producer->token == consumer->token) { + if (producer->eventfd == consumer->eventfd) { __disconnect(producer, consumer); break; } } + consumer->eventfd = NULL; list_del(&consumer->node); break; } + WARN_ON_ONCE(consumer->eventfd); mutex_unlock(&lock); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer); -- cgit v1.2.3 From add57f493e0893ac0fb4acbdc441918d3e800f10 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 May 2025 16:07:30 -0700 Subject: irqbypass: Explicitly track producer and consumer bindings Explicitly track IRQ bypass producer:consumer bindings. This will allow making removal an O(1) operation; searching through the list to find information that is trivially tracked (and useful for debug) is wasteful. Reviewed-by: Kevin Tian Acked-by: Michael S. Tsirkin Reviewed-by: Alex Williamson Link: https://lore.kernel.org/r/20250516230734.2564775-5-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/lib/irqbypass.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'virt') diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index e8d7c420db52..fdbf7ecc0c21 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -51,6 +51,10 @@ static int __connect(struct irq_bypass_producer *prod, if (prod->start) prod->start(prod); + if (!ret) { + prod->consumer = cons; + cons->producer = prod; + } return ret; } @@ -72,6 +76,9 @@ static void __disconnect(struct irq_bypass_producer *prod, cons->start(cons); if (prod->start) prod->start(prod); + + prod->consumer = NULL; + cons->producer = NULL; } /** @@ -145,6 +152,7 @@ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) list_for_each_entry(consumer, &consumers, node) { if (consumer->eventfd == producer->eventfd) { + WARN_ON_ONCE(producer->consumer != consumer); __disconnect(producer, consumer); break; } @@ -234,6 +242,7 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) list_for_each_entry(producer, &producers, node) { if (producer->eventfd == consumer->eventfd) { + WARN_ON_ONCE(consumer->producer != producer); __disconnect(producer, consumer); break; } -- cgit v1.2.3 From 5d7dbdce388b43cf3a9bc50c4132493de26aeba4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 May 2025 16:07:31 -0700 Subject: irqbypass: Use paired consumer/producer to disconnect during unregister Use the paired consumer/producer information to disconnect IRQ bypass producers/consumers in O(1) time (ignoring the cost of __disconnect()). Reviewed-by: Kevin Tian Acked-by: Michael S. Tsirkin Reviewed-by: Alex Williamson Link: https://lore.kernel.org/r/20250516230734.2564775-6-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/lib/irqbypass.c | 48 ++++++++---------------------------------------- 1 file changed, 8 insertions(+), 40 deletions(-) (limited to 'virt') diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index fdbf7ecc0c21..6a183459dc44 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -138,32 +138,16 @@ EXPORT_SYMBOL_GPL(irq_bypass_register_producer); */ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) { - struct irq_bypass_producer *tmp; - struct irq_bypass_consumer *consumer; - if (!producer->eventfd) return; mutex_lock(&lock); - list_for_each_entry(tmp, &producers, node) { - if (tmp->eventfd != producer->eventfd) - continue; - - list_for_each_entry(consumer, &consumers, node) { - if (consumer->eventfd == producer->eventfd) { - WARN_ON_ONCE(producer->consumer != consumer); - __disconnect(producer, consumer); - break; - } - } - - producer->eventfd = NULL; - list_del(&producer->node); - break; - } + if (producer->consumer) + __disconnect(producer, producer->consumer); - WARN_ON_ONCE(producer->eventfd); + producer->eventfd = NULL; + list_del(&producer->node); mutex_unlock(&lock); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer); @@ -228,32 +212,16 @@ EXPORT_SYMBOL_GPL(irq_bypass_register_consumer); */ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) { - struct irq_bypass_consumer *tmp; - struct irq_bypass_producer *producer; - if (!consumer->eventfd) return; mutex_lock(&lock); - list_for_each_entry(tmp, &consumers, node) { - if (tmp != consumer) - continue; - - list_for_each_entry(producer, &producers, node) { - if (producer->eventfd == consumer->eventfd) { - WARN_ON_ONCE(consumer->producer != producer); - __disconnect(producer, consumer); - break; - } - } - - consumer->eventfd = NULL; - list_del(&consumer->node); - break; - } + if (consumer->producer) + __disconnect(consumer->producer, consumer); - WARN_ON_ONCE(consumer->eventfd); + consumer->eventfd = NULL; + list_del(&consumer->node); mutex_unlock(&lock); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer); -- cgit v1.2.3 From 46a4bfd0ae480cabbacc56fe0d8f91cbe229c7ce Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 May 2025 16:07:32 -0700 Subject: irqbypass: Use guard(mutex) in lieu of manual lock+unlock Use guard(mutex) to clean up irqbypass's error handling. Reviewed-by: Kevin Tian Acked-by: Michael S. Tsirkin Reviewed-by: Alex Williamson Link: https://lore.kernel.org/r/20250516230734.2564775-7-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/lib/irqbypass.c | 38 ++++++++++---------------------------- 1 file changed, 10 insertions(+), 28 deletions(-) (limited to 'virt') diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index 6a183459dc44..828556c081f5 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -99,33 +99,25 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer, if (WARN_ON_ONCE(producer->eventfd)) return -EINVAL; - mutex_lock(&lock); + guard(mutex)(&lock); list_for_each_entry(tmp, &producers, node) { - if (tmp->eventfd == eventfd) { - ret = -EBUSY; - goto out_err; - } + if (tmp->eventfd == eventfd) + return -EBUSY; } list_for_each_entry(consumer, &consumers, node) { if (consumer->eventfd == eventfd) { ret = __connect(producer, consumer); if (ret) - goto out_err; + return ret; break; } } producer->eventfd = eventfd; list_add(&producer->node, &producers); - - mutex_unlock(&lock); - return 0; -out_err: - mutex_unlock(&lock); - return ret; } EXPORT_SYMBOL_GPL(irq_bypass_register_producer); @@ -141,14 +133,13 @@ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) if (!producer->eventfd) return; - mutex_lock(&lock); + guard(mutex)(&lock); if (producer->consumer) __disconnect(producer, producer->consumer); producer->eventfd = NULL; list_del(&producer->node); - mutex_unlock(&lock); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer); @@ -173,33 +164,25 @@ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer, if (!consumer->add_producer || !consumer->del_producer) return -EINVAL; - mutex_lock(&lock); + guard(mutex)(&lock); list_for_each_entry(tmp, &consumers, node) { - if (tmp->eventfd == eventfd) { - ret = -EBUSY; - goto out_err; - } + if (tmp->eventfd == eventfd) + return -EBUSY; } list_for_each_entry(producer, &producers, node) { if (producer->eventfd == eventfd) { ret = __connect(producer, consumer); if (ret) - goto out_err; + return ret; break; } } consumer->eventfd = eventfd; list_add(&consumer->node, &consumers); - - mutex_unlock(&lock); - return 0; -out_err: - mutex_unlock(&lock); - return ret; } EXPORT_SYMBOL_GPL(irq_bypass_register_consumer); @@ -215,13 +198,12 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) if (!consumer->eventfd) return; - mutex_lock(&lock); + guard(mutex)(&lock); if (consumer->producer) __disconnect(consumer->producer, consumer); consumer->eventfd = NULL; list_del(&consumer->node); - mutex_unlock(&lock); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer); -- cgit v1.2.3 From 8394b32faecd9c63b3c436e78e62519e9548e530 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 May 2025 16:07:33 -0700 Subject: irqbypass: Use xarray to track producers and consumers Track IRQ bypass producers and consumers using an xarray to avoid the O(2n) insertion time associated with walking a list to check for duplicate entries, and to search for an partner. At low (tens or few hundreds) total producer/consumer counts, using a list is faster due to the need to allocate backing storage for xarray. But as count creeps into the thousands, xarray wins easily, and can provide several orders of magnitude better latency at high counts. E.g. hundreds of nanoseconds vs. hundreds of milliseconds. Cc: Oliver Upton Cc: David Matlack Cc: Like Xu Cc: Binbin Wu Reported-by: Yong He Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217379 Link: https://lore.kernel.org/all/20230801115646.33990-1-likexu@tencent.com Reviewed-by: Kevin Tian Acked-by: Michael S. Tsirkin Reviewed-by: Alex Williamson Link: https://lore.kernel.org/r/20250516230734.2564775-8-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/lib/irqbypass.c | 74 +++++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 36 deletions(-) (limited to 'virt') diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index 828556c081f5..ea888b9203d2 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -22,8 +22,8 @@ MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("IRQ bypass manager utility module"); -static LIST_HEAD(producers); -static LIST_HEAD(consumers); +static DEFINE_XARRAY(producers); +static DEFINE_XARRAY(consumers); static DEFINE_MUTEX(lock); /* @lock must be held when calling connect */ @@ -86,13 +86,13 @@ static void __disconnect(struct irq_bypass_producer *prod, * @producer: pointer to producer structure * @eventfd: pointer to the eventfd context associated with the producer * - * Add the provided IRQ producer to the list of producers and connect - * with any matching eventfd found on the IRQ consumers list. + * Add the provided IRQ producer to the set of producers and connect with the + * consumer with a matching eventfd, if one exists. */ int irq_bypass_register_producer(struct irq_bypass_producer *producer, struct eventfd_ctx *eventfd) { - struct irq_bypass_producer *tmp; + unsigned long index = (unsigned long)eventfd; struct irq_bypass_consumer *consumer; int ret; @@ -101,22 +101,20 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer, guard(mutex)(&lock); - list_for_each_entry(tmp, &producers, node) { - if (tmp->eventfd == eventfd) - return -EBUSY; - } + ret = xa_insert(&producers, index, producer, GFP_KERNEL); + if (ret) + return ret; - list_for_each_entry(consumer, &consumers, node) { - if (consumer->eventfd == eventfd) { - ret = __connect(producer, consumer); - if (ret) - return ret; - break; + consumer = xa_load(&consumers, index); + if (consumer) { + ret = __connect(producer, consumer); + if (ret) { + WARN_ON_ONCE(xa_erase(&producers, index) != producer); + return ret; } } producer->eventfd = eventfd; - list_add(&producer->node, &producers); return 0; } EXPORT_SYMBOL_GPL(irq_bypass_register_producer); @@ -125,11 +123,14 @@ EXPORT_SYMBOL_GPL(irq_bypass_register_producer); * irq_bypass_unregister_producer - unregister IRQ bypass producer * @producer: pointer to producer structure * - * Remove a previously registered IRQ producer from the list of producers - * and disconnect it from any connected IRQ consumer. + * Remove a previously registered IRQ producer (note, it's safe to call this + * even if registration was unsuccessful). Disconnect from the associated + * consumer, if one exists. */ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) { + unsigned long index = (unsigned long)producer->eventfd; + if (!producer->eventfd) return; @@ -138,8 +139,8 @@ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) if (producer->consumer) __disconnect(producer, producer->consumer); + WARN_ON_ONCE(xa_erase(&producers, index) != producer); producer->eventfd = NULL; - list_del(&producer->node); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer); @@ -148,13 +149,13 @@ EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer); * @consumer: pointer to consumer structure * @eventfd: pointer to the eventfd context associated with the consumer * - * Add the provided IRQ consumer to the list of consumers and connect - * with any matching eventfd found on the IRQ producer list. + * Add the provided IRQ consumer to the set of consumers and connect with the + * producer with a matching eventfd, if one exists. */ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer, struct eventfd_ctx *eventfd) { - struct irq_bypass_consumer *tmp; + unsigned long index = (unsigned long)eventfd; struct irq_bypass_producer *producer; int ret; @@ -166,22 +167,20 @@ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer, guard(mutex)(&lock); - list_for_each_entry(tmp, &consumers, node) { - if (tmp->eventfd == eventfd) - return -EBUSY; - } + ret = xa_insert(&consumers, index, consumer, GFP_KERNEL); + if (ret) + return ret; - list_for_each_entry(producer, &producers, node) { - if (producer->eventfd == eventfd) { - ret = __connect(producer, consumer); - if (ret) - return ret; - break; + producer = xa_load(&producers, index); + if (producer) { + ret = __connect(producer, consumer); + if (ret) { + WARN_ON_ONCE(xa_erase(&consumers, index) != consumer); + return ret; } } consumer->eventfd = eventfd; - list_add(&consumer->node, &consumers); return 0; } EXPORT_SYMBOL_GPL(irq_bypass_register_consumer); @@ -190,11 +189,14 @@ EXPORT_SYMBOL_GPL(irq_bypass_register_consumer); * irq_bypass_unregister_consumer - unregister IRQ bypass consumer * @consumer: pointer to consumer structure * - * Remove a previously registered IRQ consumer from the list of consumers - * and disconnect it from any connected IRQ producer. + * Remove a previously registered IRQ consumer (note, it's safe to call this + * even if registration was unsuccessful). Disconnect from the associated + * producer, if one exists. */ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) { + unsigned long index = (unsigned long)consumer->eventfd; + if (!consumer->eventfd) return; @@ -203,7 +205,7 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) if (consumer->producer) __disconnect(consumer->producer, consumer); + WARN_ON_ONCE(xa_erase(&consumers, index) != consumer); consumer->eventfd = NULL; - list_del(&consumer->node); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer); -- cgit v1.2.3 From 23b54381cee2928e8b5622e654ca4516f30d2f1a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 May 2025 16:07:34 -0700 Subject: irqbypass: Require producers to pass in Linux IRQ number during registration Pass in the Linux IRQ associated with an IRQ bypass producer instead of relying on the caller to set the field prior to registration, as there's no benefit to relying on callers to do the right thing. Take care to set producer->irq before __connect(), as KVM expects the IRQ to be valid as soon as a connection is possible. Reviewed-by: Kevin Tian Reviewed-by: Alex Williamson Acked-by: Michael S. Tsirkin Link: https://lore.kernel.org/r/20250516230734.2564775-9-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/lib/irqbypass.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'virt') diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index ea888b9203d2..62c160200be9 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -85,12 +85,13 @@ static void __disconnect(struct irq_bypass_producer *prod, * irq_bypass_register_producer - register IRQ bypass producer * @producer: pointer to producer structure * @eventfd: pointer to the eventfd context associated with the producer + * @irq: Linux IRQ number of the underlying producer device * * Add the provided IRQ producer to the set of producers and connect with the * consumer with a matching eventfd, if one exists. */ int irq_bypass_register_producer(struct irq_bypass_producer *producer, - struct eventfd_ctx *eventfd) + struct eventfd_ctx *eventfd, int irq) { unsigned long index = (unsigned long)eventfd; struct irq_bypass_consumer *consumer; @@ -99,6 +100,8 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer, if (WARN_ON_ONCE(producer->eventfd)) return -EINVAL; + producer->irq = irq; + guard(mutex)(&lock); ret = xa_insert(&producers, index, producer, GFP_KERNEL); -- cgit v1.2.3 From e295d2e7fbe69ddec772c951c466dfbfc1c96818 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:40 -0700 Subject: KVM: x86: Trigger I/O APIC route rescan in kvm_arch_irq_routing_update() Trigger the I/O APIC route rescan that's performed for a split IRQ chip after userspace updates IRQ routes in kvm_arch_irq_routing_update(), i.e. before dropping kvm->irq_lock. Calling kvm_make_all_cpus_request() under a mutex is perfectly safe, and the smp_wmb()+smp_mb__after_atomic() pair in __kvm_make_request()+kvm_check_request() ensures the new routing is visible to vCPUs prior to the request being visible to vCPUs. In all likelihood, commit b053b2aef25d ("KVM: x86: Add EOI exit bitmap inference") somewhat arbitrarily made the request outside of irq_lock to avoid holding irq_lock any longer than is strictly necessary. And then commit abdb080f7ac8 ("kvm/irqchip: kvm_arch_irq_routing_update renaming split") took the easy route of adding another arch hook instead of risking a functional change. Note, the call to synchronize_srcu_expedited() does NOT provide ordering guarantees with respect to vCPUs scanning the new routing; as above, the request infrastructure provides the necessary ordering. I.e. there's no need to wait for kvm_scan_ioapic_routes() to complete if it's actively running, because regardless of whether it grabs the old or new table, the vCPU will have another KVM_REQ_SCAN_IOAPIC pending, i.e. will rescan again and see the new mappings. Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-2-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/irqchip.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'virt') diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index 162d8ed889f2..6ccabfd32287 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -222,8 +222,6 @@ int kvm_set_irq_routing(struct kvm *kvm, kvm_arch_irq_routing_update(kvm); mutex_unlock(&kvm->irq_lock); - kvm_arch_post_irq_routing_update(kvm); - synchronize_srcu_expedited(&kvm->irq_srcu); new = old; -- cgit v1.2.3 From cb210737675ef4c1ad88721e84558eeb2f199312 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:06 -0700 Subject: KVM: Pass new routing entries and irqfd when updating IRTEs When updating IRTEs in response to a GSI routing or IRQ bypass change, pass the new/current routing information along with the associated irqfd. This will allow KVM x86 to harden, simplify, and deduplicate its code. Since adding/removing a bypass producer is now conveniently protected with irqfds.lock, i.e. can't run concurrently with kvm_irq_routing_update(), use the routing information cached in the irqfd instead of looking up the information in the current GSI routing tables. Opportunistically convert an existing printk() to pr_info() and put its string onto a single line (old code that strictly adhered to 80 chars). Link: https://lore.kernel.org/r/20250611224604.313496-5-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/eventfd.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'virt') diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 5bc6abe30748..bd1766da6895 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -285,9 +285,9 @@ void __attribute__((weak)) kvm_arch_irq_bypass_start( { } -int __attribute__((weak)) kvm_arch_update_irqfd_routing( - struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) +int __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) { return 0; } @@ -618,9 +618,8 @@ void kvm_irq_routing_update(struct kvm *kvm) #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) if (irqfd->producer && kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) { - int ret = kvm_arch_update_irqfd_routing( - irqfd->kvm, irqfd->producer->irq, - irqfd->gsi, 1); + int ret = kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry); + WARN_ON(ret); } #endif -- cgit v1.2.3 From b33252b9d17238a4a9fa5a29af6e6a2922a6c2b0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:35 -0700 Subject: KVM: Don't WARN if updating IRQ bypass route fails Don't bother WARNing if updating an IRTE route fails now that vendor code provides much more precise WARNs. The generic WARN doesn't provide enough information to actually debug the problem, and has obviously done nothing to surface the myriad bugs in KVM x86's implementation. Drop all of the associated return code plumbing that existed just so that common KVM could WARN. Link: https://lore.kernel.org/r/20250611224604.313496-34-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/eventfd.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'virt') diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index bd1766da6895..719b242fc935 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -285,11 +285,11 @@ void __attribute__((weak)) kvm_arch_irq_bypass_start( { } -int __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, - struct kvm_kernel_irq_routing_entry *old, - struct kvm_kernel_irq_routing_entry *new) +void __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) { - return 0; + } bool __attribute__((weak)) kvm_arch_irqfd_route_changed( @@ -617,11 +617,8 @@ void kvm_irq_routing_update(struct kvm *kvm) #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) if (irqfd->producer && - kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) { - int ret = kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry); - - WARN_ON(ret); - } + kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) + kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry); #endif } -- cgit v1.2.3 From 77bb184ab880171a1cedfbed9ab05977e6ae2258 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:36 -0700 Subject: KVM: Fold kvm_arch_irqfd_route_changed() into kvm_arch_update_irqfd_routing() Fold kvm_arch_irqfd_route_changed() into kvm_arch_update_irqfd_routing(). Calling arch code to know whether or not to call arch code is absurd. Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250611224604.313496-35-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/eventfd.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'virt') diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 719b242fc935..59b1e64697f1 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -291,13 +291,6 @@ void __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, { } - -bool __attribute__((weak)) kvm_arch_irqfd_route_changed( - struct kvm_kernel_irq_routing_entry *old, - struct kvm_kernel_irq_routing_entry *new) -{ - return true; -} #endif static int @@ -616,8 +609,7 @@ void kvm_irq_routing_update(struct kvm *kvm) irqfd_update(kvm, irqfd); #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) - if (irqfd->producer && - kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) + if (irqfd->producer) kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry); #endif } -- cgit v1.2.3 From 283ed5001d6852f85c09ed2522331b2b197ba937 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:11 -0700 Subject: KVM: Use a local struct to do the initial vfs_poll() on an irqfd Use a function-local struct for the poll_table passed to vfs_poll(), as nothing in the vfs_poll() callchain grabs a long-term reference to the structure, i.e. its lifetime doesn't need to be tied to the irqfd. Using a local structure will also allow propagating failures out of the polling callback without further polluting kvm_kernel_irqfd. Opportunstically rename irqfd_ptable_queue_proc() to kvm_irqfd_register() to capture what it actually does. Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-2-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/eventfd.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'virt') diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 59b1e64697f1..0b655376734e 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -245,12 +245,17 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) return ret; } -static void -irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, - poll_table *pt) +struct kvm_irqfd_pt { + struct kvm_kernel_irqfd *irqfd; + poll_table pt; +}; + +static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, + poll_table *pt) { - struct kvm_kernel_irqfd *irqfd = - container_of(pt, struct kvm_kernel_irqfd, pt); + struct kvm_irqfd_pt *p = container_of(pt, struct kvm_irqfd_pt, pt); + struct kvm_kernel_irqfd *irqfd = p->irqfd; + add_wait_queue_priority(wqh, &irqfd->wait); } @@ -298,6 +303,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) { struct kvm_kernel_irqfd *irqfd, *tmp; struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; + struct kvm_irqfd_pt irqfd_pt; int ret; __poll_t events; int idx; @@ -387,7 +393,6 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) * a callback whenever someone signals the underlying eventfd */ init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); - init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc); spin_lock_irq(&kvm->irqfds.lock); @@ -409,11 +414,14 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) spin_unlock_irq(&kvm->irqfds.lock); /* - * Check if there was an event already pending on the eventfd - * before we registered, and trigger it as if we didn't miss it. + * Register the irqfd with the eventfd by polling on the eventfd. If + * there was en event pending on the eventfd prior to registering, + * manually trigger IRQ injection. */ - events = vfs_poll(fd_file(f), &irqfd->pt); + irqfd_pt.irqfd = irqfd; + init_poll_funcptr(&irqfd_pt.pt, kvm_irqfd_register); + events = vfs_poll(fd_file(f), &irqfd_pt.pt); if (events & EPOLLIN) schedule_work(&irqfd->inject); -- cgit v1.2.3 From 140768a7bf03df2a746cdbd4b6dc938d80caad8d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:12 -0700 Subject: KVM: Acquire SCRU lock outside of irqfds.lock during assignment Acquire SRCU outside of irqfds.lock so that the locking is symmetrical, and add a comment explaining why on earth KVM holds SRCU for so long. Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-3-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/eventfd.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'virt') diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 0b655376734e..614f81cd37c1 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -394,6 +394,18 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) */ init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); + /* + * Set the irqfd routing and add it to KVM's list before registering + * the irqfd with the eventfd, so that the routing information is valid + * and stays valid, e.g. if there are GSI routing changes, prior to + * making the irqfd visible, i.e. before it might be signaled. + * + * Note, holding SRCU ensures a stable read of routing information, and + * also prevents irqfd_shutdown() from freeing the irqfd before it's + * fully initialized. + */ + idx = srcu_read_lock(&kvm->irq_srcu); + spin_lock_irq(&kvm->irqfds.lock); ret = 0; @@ -402,11 +414,9 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) continue; /* This fd is used for another irq already. */ ret = -EBUSY; - spin_unlock_irq(&kvm->irqfds.lock); - goto fail; + goto fail_duplicate; } - idx = srcu_read_lock(&kvm->irq_srcu); irqfd_update(kvm, irqfd); list_add_tail(&irqfd->list, &kvm->irqfds.items); @@ -441,6 +451,9 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) srcu_read_unlock(&kvm->irq_srcu, idx); return 0; +fail_duplicate: + spin_unlock_irq(&kvm->irqfds.lock); + srcu_read_unlock(&kvm->irq_srcu, idx); fail: if (irqfd->resampler) irqfd_resampler_shutdown(irqfd); -- cgit v1.2.3 From b5c543518ae9df8e99c63cd08a8b573f0141b31a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:13 -0700 Subject: KVM: Initialize irqfd waitqueue callback when adding to the queue Initialize the irqfd waitqueue callback immediately prior to inserting the irqfd into the eventfd's waitqueue. Pre-initializing the state in a completely different context is all kinds of confusing, and incorrectly suggests that the waitqueue function needs to be initialize prior to vfs_poll(). Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-4-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/eventfd.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'virt') diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 614f81cd37c1..7b7c5269cf18 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -256,6 +256,13 @@ static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, struct kvm_irqfd_pt *p = container_of(pt, struct kvm_irqfd_pt, pt); struct kvm_kernel_irqfd *irqfd = p->irqfd; + /* + * Add the irqfd as a priority waiter on the eventfd, with a custom + * wake-up handler, so that KVM *and only KVM* is notified whenever the + * underlying eventfd is signaled. + */ + init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); + add_wait_queue_priority(wqh, &irqfd->wait); } @@ -388,12 +395,6 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) mutex_unlock(&kvm->irqfds.resampler_lock); } - /* - * Install our own custom wake-up handling so we are notified via - * a callback whenever someone signals the underlying eventfd - */ - init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); - /* * Set the irqfd routing and add it to KVM's list before registering * the irqfd with the eventfd, so that the routing information is valid -- cgit v1.2.3 From 5f8ca05ea99183ab2b69c7fd9617961211d194e7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:14 -0700 Subject: KVM: Add irqfd to KVM's list via the vfs_poll() callback Add the irqfd structure to KVM's list of irqfds in kvm_irqfd_register(), i.e. via the vfs_poll() callback. This will allow taking irqfds.lock across the entire registration sequence (add to waitqueue, add to list), and more importantly will allow inserting into KVM's list if and only if adding to the waitqueue succeeds (spoiler alert), without needing to juggle return codes in weird ways. Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-5-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/eventfd.c | 98 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 55 insertions(+), 43 deletions(-) (limited to 'virt') diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 7b7c5269cf18..170a67aad983 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -245,9 +245,32 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) return ret; } +static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd) +{ + struct kvm_kernel_irq_routing_entry *e; + struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; + int n_entries; + + lockdep_assert_held(&kvm->irqfds.lock); + + n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi); + + write_seqcount_begin(&irqfd->irq_entry_sc); + + e = entries; + if (n_entries == 1) + irqfd->irq_entry = *e; + else + irqfd->irq_entry.type = 0; + + write_seqcount_end(&irqfd->irq_entry_sc); +} + struct kvm_irqfd_pt { struct kvm_kernel_irqfd *irqfd; + struct kvm *kvm; poll_table pt; + int ret; }; static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, @@ -255,6 +278,25 @@ static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, { struct kvm_irqfd_pt *p = container_of(pt, struct kvm_irqfd_pt, pt); struct kvm_kernel_irqfd *irqfd = p->irqfd; + struct kvm_kernel_irqfd *tmp; + struct kvm *kvm = p->kvm; + + spin_lock_irq(&kvm->irqfds.lock); + + list_for_each_entry(tmp, &kvm->irqfds.items, list) { + if (irqfd->eventfd != tmp->eventfd) + continue; + /* This fd is used for another irq already. */ + p->ret = -EBUSY; + spin_unlock_irq(&kvm->irqfds.lock); + return; + } + + irqfd_update(kvm, irqfd); + + list_add_tail(&irqfd->list, &kvm->irqfds.items); + + spin_unlock_irq(&kvm->irqfds.lock); /* * Add the irqfd as a priority waiter on the eventfd, with a custom @@ -264,26 +306,7 @@ static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); add_wait_queue_priority(wqh, &irqfd->wait); -} - -/* Must be called under irqfds.lock */ -static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd) -{ - struct kvm_kernel_irq_routing_entry *e; - struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; - int n_entries; - - n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi); - - write_seqcount_begin(&irqfd->irq_entry_sc); - - e = entries; - if (n_entries == 1) - irqfd->irq_entry = *e; - else - irqfd->irq_entry.type = 0; - - write_seqcount_end(&irqfd->irq_entry_sc); + p->ret = 0; } #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) @@ -308,7 +331,7 @@ void __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, static int kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) { - struct kvm_kernel_irqfd *irqfd, *tmp; + struct kvm_kernel_irqfd *irqfd; struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; struct kvm_irqfd_pt irqfd_pt; int ret; @@ -407,32 +430,22 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) */ idx = srcu_read_lock(&kvm->irq_srcu); - spin_lock_irq(&kvm->irqfds.lock); - - ret = 0; - list_for_each_entry(tmp, &kvm->irqfds.items, list) { - if (irqfd->eventfd != tmp->eventfd) - continue; - /* This fd is used for another irq already. */ - ret = -EBUSY; - goto fail_duplicate; - } - - irqfd_update(kvm, irqfd); - - list_add_tail(&irqfd->list, &kvm->irqfds.items); - - spin_unlock_irq(&kvm->irqfds.lock); - /* - * Register the irqfd with the eventfd by polling on the eventfd. If - * there was en event pending on the eventfd prior to registering, - * manually trigger IRQ injection. + * Register the irqfd with the eventfd by polling on the eventfd, and + * simultaneously and the irqfd to KVM's list. If there was en event + * pending on the eventfd prior to registering, manually trigger IRQ + * injection. */ irqfd_pt.irqfd = irqfd; + irqfd_pt.kvm = kvm; init_poll_funcptr(&irqfd_pt.pt, kvm_irqfd_register); events = vfs_poll(fd_file(f), &irqfd_pt.pt); + + ret = irqfd_pt.ret; + if (ret) + goto fail_poll; + if (events & EPOLLIN) schedule_work(&irqfd->inject); @@ -452,8 +465,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) srcu_read_unlock(&kvm->irq_srcu, idx); return 0; -fail_duplicate: - spin_unlock_irq(&kvm->irqfds.lock); +fail_poll: srcu_read_unlock(&kvm->irq_srcu, idx); fail: if (irqfd->resampler) -- cgit v1.2.3 From 86e00cd162a727c0b847def89bbd787c20eb8f5d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:15 -0700 Subject: KVM: Add irqfd to eventfd's waitqueue while holding irqfds.lock Add an irqfd to its target eventfd's waitqueue while holding irqfds.lock, which is mildly terrifying but functionally safe. irqfds.lock is taken inside the waitqueue's lock, but if and only if the eventfd is being released, i.e. that path is mutually exclusive with registration as KVM holds a reference to the eventfd (and obviously must do so to avoid UAF). This will allow using the eventfd's waitqueue to enforce KVM's requirement that eventfd is assigned to at most one irqfd, without introducing races. Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-6-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/eventfd.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'virt') diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 170a67aad983..a9a13f919de8 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -204,6 +204,11 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) int ret = 0; if (flags & EPOLLIN) { + /* + * WARNING: Do NOT take irqfds.lock in any path except EPOLLHUP, + * as KVM holds irqfds.lock when registering the irqfd with the + * eventfd. + */ u64 cnt; eventfd_ctx_do_read(irqfd->eventfd, &cnt); @@ -225,6 +230,11 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) /* The eventfd is closing, detach from KVM */ unsigned long iflags; + /* + * Taking irqfds.lock is safe here, as KVM holds a reference to + * the eventfd when registering the irqfd, i.e. this path can't + * be reached while kvm_irqfd_add() is running. + */ spin_lock_irqsave(&kvm->irqfds.lock, iflags); /* @@ -296,16 +306,21 @@ static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, list_add_tail(&irqfd->list, &kvm->irqfds.items); - spin_unlock_irq(&kvm->irqfds.lock); - /* * Add the irqfd as a priority waiter on the eventfd, with a custom * wake-up handler, so that KVM *and only KVM* is notified whenever the - * underlying eventfd is signaled. + * underlying eventfd is signaled. Temporarily lie to lockdep about + * holding irqfds.lock to avoid a false positive regarding potential + * deadlock with irqfd_wakeup() (see irqfd_wakeup() for details). */ init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); + spin_release(&kvm->irqfds.lock.dep_map, _RET_IP_); add_wait_queue_priority(wqh, &irqfd->wait); + spin_acquire(&kvm->irqfds.lock.dep_map, 0, 0, _RET_IP_); + + spin_unlock_irq(&kvm->irqfds.lock); + p->ret = 0; } -- cgit v1.2.3 From 867347bb21e18551b48eb147c0b2d8635909c8b5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:16 -0700 Subject: sched/wait: Drop WQ_FLAG_EXCLUSIVE from add_wait_queue_priority() Drop the setting of WQ_FLAG_EXCLUSIVE from add_wait_queue_priority() and instead have callers manually add the flag prior to adding their structure to the queue. Blindly setting WQ_FLAG_EXCLUSIVE is flawed, as the nature of exclusive, priority waiters means that only the first waiter added will ever receive notifications. Pushing the flawed behavior to callers will allow fixing the problem one hypervisor at a time (KVM added the flawed API, and then KVM's code was copy+pasted nearly verbatim by Xen and Hyper-V), and will also allow for adding an API that provides true exclusivity, i.e. that guarantees at most one priority waiter is in the queue. Opportunistically add a comment in Hyper-V to call out the mess. Xen privcmd's irqfd_wakefup() doesn't actually operate in exclusive mode, i.e. can be "fixed" simply by dropping WQ_FLAG_EXCLUSIVE. And KVM is primed to switch to the aforementioned fully exclusive API, i.e. won't be carrying the flawed code for long. No functional change intended. Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-7-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/eventfd.c | 1 + 1 file changed, 1 insertion(+) (limited to 'virt') diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index a9a13f919de8..f8c2486f95d5 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -316,6 +316,7 @@ static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); spin_release(&kvm->irqfds.lock.dep_map, _RET_IP_); + irqfd->wait.flags |= WQ_FLAG_EXCLUSIVE; add_wait_queue_priority(wqh, &irqfd->wait); spin_acquire(&kvm->irqfds.lock.dep_map, 0, 0, _RET_IP_); -- cgit v1.2.3 From 2cdd64cbf9906f9d2d52ef96e1471992ff6c27ec Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:19 -0700 Subject: KVM: Disallow binding multiple irqfds to an eventfd with a priority waiter Disallow binding an irqfd to an eventfd that already has a priority waiter, i.e. to an eventfd that already has an attached irqfd. KVM always operates in exclusive mode for EPOLL_IN (unconditionally returns '1'), i.e. only the first waiter will be notified. KVM already disallows binding multiple irqfds to an eventfd in a single VM, but doesn't guard against multiple VMs binding to an eventfd. Adding the extra protection reduces the pain of a userspace VMM bug, e.g. if userspace fails to de-assign before re-assigning when transferring state for intra-host migration, then the migration will explicitly fail as opposed to dropping IRQs on the destination VM. Temporarily keep KVM's manual check on irqfds.items, but add a WARN, e.g. to allow sanity checking the waitqueue enforcement. Cc: Oliver Upton Cc: David Matlack Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-10-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/eventfd.c | 55 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 18 deletions(-) (limited to 'virt') diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index f8c2486f95d5..ab9e3cfb81bf 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -291,38 +291,57 @@ static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, struct kvm_kernel_irqfd *tmp; struct kvm *kvm = p->kvm; + /* + * Note, irqfds.lock protects the irqfd's irq_entry, i.e. its routing, + * and irqfds.items. It does NOT protect registering with the eventfd. + */ spin_lock_irq(&kvm->irqfds.lock); - list_for_each_entry(tmp, &kvm->irqfds.items, list) { - if (irqfd->eventfd != tmp->eventfd) - continue; - /* This fd is used for another irq already. */ - p->ret = -EBUSY; - spin_unlock_irq(&kvm->irqfds.lock); - return; - } - + /* + * Initialize the routing information prior to adding the irqfd to the + * eventfd's waitqueue, as irqfd_wakeup() can be invoked as soon as the + * irqfd is registered. + */ irqfd_update(kvm, irqfd); - list_add_tail(&irqfd->list, &kvm->irqfds.items); - /* * Add the irqfd as a priority waiter on the eventfd, with a custom * wake-up handler, so that KVM *and only KVM* is notified whenever the - * underlying eventfd is signaled. Temporarily lie to lockdep about - * holding irqfds.lock to avoid a false positive regarding potential - * deadlock with irqfd_wakeup() (see irqfd_wakeup() for details). + * underlying eventfd is signaled. */ init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); + /* + * Temporarily lie to lockdep about holding irqfds.lock to avoid a + * false positive regarding potential deadlock with irqfd_wakeup() + * (see irqfd_wakeup() for details). + * + * Adding to the wait queue will fail if there is already a priority + * waiter, i.e. if the eventfd is associated with another irqfd (in any + * VM). Note, kvm_irqfd_deassign() waits for all in-flight shutdown + * jobs to complete, i.e. ensures the irqfd has been removed from the + * eventfd's waitqueue before returning to userspace. + */ spin_release(&kvm->irqfds.lock.dep_map, _RET_IP_); - irqfd->wait.flags |= WQ_FLAG_EXCLUSIVE; - add_wait_queue_priority(wqh, &irqfd->wait); + p->ret = add_wait_queue_priority_exclusive(wqh, &irqfd->wait); spin_acquire(&kvm->irqfds.lock.dep_map, 0, 0, _RET_IP_); + if (p->ret) + goto out; - spin_unlock_irq(&kvm->irqfds.lock); + list_for_each_entry(tmp, &kvm->irqfds.items, list) { + if (irqfd->eventfd != tmp->eventfd) + continue; - p->ret = 0; + WARN_ON_ONCE(1); + /* This fd is used for another irq already. */ + p->ret = -EBUSY; + goto out; + } + + list_add_tail(&irqfd->list, &kvm->irqfds.items); + +out: + spin_unlock_irq(&kvm->irqfds.lock); } #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) -- cgit v1.2.3 From b599d44a71f1aa1acff54b05e4c0e60ea82ed90c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:20 -0700 Subject: KVM: Drop sanity check that per-VM list of irqfds is unique Now that the eventfd's waitqueue ensures it has at most one priority waiter, i.e. prevents KVM from binding multiple irqfds to one eventfd, drop KVM's sanity check that eventfds are unique for a single VM. Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-11-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/eventfd.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'virt') diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index ab9e3cfb81bf..6b1133a6617f 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -288,7 +288,6 @@ static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, { struct kvm_irqfd_pt *p = container_of(pt, struct kvm_irqfd_pt, pt); struct kvm_kernel_irqfd *irqfd = p->irqfd; - struct kvm_kernel_irqfd *tmp; struct kvm *kvm = p->kvm; /* @@ -328,16 +327,6 @@ static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, if (p->ret) goto out; - list_for_each_entry(tmp, &kvm->irqfds.items, list) { - if (irqfd->eventfd != tmp->eventfd) - continue; - - WARN_ON_ONCE(1); - /* This fd is used for another irq already. */ - p->ret = -EBUSY; - goto out; - } - list_add_tail(&irqfd->list, &kvm->irqfds.items); out: -- cgit v1.2.3