author	Linus Torvalds <torvalds@linux-foundation.org>	2024-05-13 12:48:06 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2024-05-13 12:48:06 -0700
commit	9961a785944601e32f185ea696347b22ffda634c (patch)
tree	24362a6360b2a1fa96c7eb562c6a95f96b37e459 /io_uring
parent	f4e8d80292859809ea135e9f4c43bae47e4f58bc (diff)
parent	deb1e496a83557896fe0cca0b8af01c2a97c0dc6 (diff)
Merge tag 'for-6.10/io_uring-20240511' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:

 - Greatly improve send zerocopy performance, by enabling coalescing of
   sent buffers.

   MSG_ZEROCOPY already does this with send(2) and sendmsg(2), but the
   io_uring side did not. In local testing, the crossover point for send
   zerocopy being faster is now around 3000 byte packets, and it
   performs better than the sync syscall variants as well.

   This feature relies on a shared branch with net-next, which was
   pulled into both branches.

 - Unification of how async preparation is done across opcodes.

   Previously, opcodes that required extra memory for async retry would
   allocate that as needed, using on-stack state until that point. If
   async retry was needed, the on-stack state was adjusted appropriately
   for a retry and then copied to the allocated memory.

   This led to some fragile and ugly code, particularly for read/write
   handling, and made storage retries more difficult than they needed to
   be. Allocate the memory upfront, as it's cheap from our pools, and
   use that state consistently both initially and also from the retry
   side.

 - Move away from using remap_pfn_range() for mapping the rings.

   This is really not the right interface to use and can cause lifetime
   issues or leaks. Additionally, it means the ring sq/cq arrays need to
   be physically contiguous, which can cause problems in production with
   larger rings when services are restarted, as memory can be very
   fragmented at that point.

   Move to using vm_insert_page(s) for the ring sq/cq arrays, and apply
   the same treatment to mapped ring provided buffers. This also helps
   unify the code we have dealing with allocating and mapping memory.

   Hard to see in the diffstat as we're adding a few features as well,
   but this kills about 400 lines of code from the codebase.

 - Add support for bundles for send/recv.

   When used with provided buffers, bundles support sending or receiving
   more than one buffer at a time, improving the efficiency by only
   needing to call into the networking stack once for multiple sends or
   receives.

 - Tweaks for our accept operations, supporting both a DONTWAIT flag for
   skipping poll arm and retry if we can, and a POLLFIRST flag that the
   application can use to skip the initial accept attempt and rely
   purely on poll for triggering the operation. Both of these have
   identical flags on the receive side already (a usage sketch follows
   the diffstat below).

 - Make the task_work ctx locking unconditional.

   We had various code paths here that would do a mix of lock/trylock
   and set the task_work state to whether or not it was locked. All of
   that goes away, we lock it unconditionally and get rid of the state
   flag indicating whether it's locked or not.

   The state struct still exists as an empty type, and can go away in
   the future.

 - Add support for specifying NOP completion values, allowing it to be
   used for error handling testing (a usage sketch follows the commit
   list below).

 - Use set/test bit for io-wq worker flags. Not strictly needed, but
   also doesn't hurt and helps silence a KCSAN warning.

 - Cleanups for io-wq locking and work assignments, closing a tiny race
   where cancelations would not be able to find the work item reliably.
 - Misc fixes, cleanups, and improvements

* tag 'for-6.10/io_uring-20240511' of git://git.kernel.dk/linux: (97 commits)
  io_uring: support to inject result for NOP
  io_uring: fail NOP if non-zero op flags is passed in
  io_uring/net: add IORING_ACCEPT_POLL_FIRST flag
  io_uring/net: add IORING_ACCEPT_DONTWAIT flag
  io_uring/filetable: don't unnecessarily clear/reset bitmap
  io_uring/io-wq: Use set_bit() and test_bit() at worker->flags
  io_uring/msg_ring: cleanup posting to IOPOLL vs !IOPOLL ring
  io_uring: Require zeroed sqe->len on provided-buffers send
  io_uring/notif: disable LAZY_WAKE for linked notifs
  io_uring/net: fix sendzc lazy wake polling
  io_uring/msg_ring: reuse ctx->submitter_task read using READ_ONCE instead of re-reading it
  io_uring/rw: reinstate thread check for retries
  io_uring/notif: implement notification stacking
  io_uring/notif: simplify io_notif_flush()
  net: add callback for setting a ubuf_info to skb
  net: extend ubuf_info callback to ops structure
  io_uring/net: support bundles for recv
  io_uring/net: support bundles for send
  io_uring/kbuf: add helpers for getting/peeking multiple buffers
  io_uring/net: add provided buffer support for IORING_OP_SEND
  ...
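The NOP result injection mentioned above is simple to drive from
userspace. A minimal sketch using liburing, assuming the
IORING_NOP_INJECT_RESULT flag and the sqe->nop_flags field added by this
series (names match the UAPI additions in this pull, but treat the exact
layout as illustrative, not authoritative):

	#include <errno.h>
	#include <liburing.h>

	static int test_injected_nop_result(struct io_uring *ring)
	{
		struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
		struct io_uring_cqe *cqe;
		int res;

		io_uring_prep_nop(sqe);
		/* ask the kernel to complete this NOP with -EFAULT */
		sqe->nop_flags = IORING_NOP_INJECT_RESULT;
		sqe->len = (__u32)-EFAULT;

		io_uring_submit(ring);
		io_uring_wait_cqe(ring, &cqe);
		/* cqe->res carries the injected value, handy for error paths */
		res = cqe->res;
		io_uring_cqe_seen(ring, cqe);
		return res;	/* expected: -EFAULT */
	}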
Diffstat (limited to 'io_uring')
-rw-r--r--	io_uring/Makefile	15
-rw-r--r--	io_uring/alloc_cache.h	59
-rw-r--r--	io_uring/cancel.c	4
-rw-r--r--	io_uring/fdinfo.c	4
-rw-r--r--	io_uring/filetable.c	4
-rw-r--r--	io_uring/futex.c	30
-rw-r--r--	io_uring/futex.h	5
-rw-r--r--	io_uring/io-wq.c	67
-rw-r--r--	io_uring/io_uring.c	665
-rw-r--r--	io_uring/io_uring.h	33
-rw-r--r--	io_uring/kbuf.c	318
-rw-r--r--	io_uring/kbuf.h	64
-rw-r--r--	io_uring/memmap.c	336
-rw-r--r--	io_uring/memmap.h	25
-rw-r--r--	io_uring/msg_ring.c	12
-rw-r--r--	io_uring/net.c	852
-rw-r--r--	io_uring/net.h	29
-rw-r--r--	io_uring/nop.c	26
-rw-r--r--	io_uring/notif.c	108
-rw-r--r--	io_uring/notif.h	13
-rw-r--r--	io_uring/opdef.c	65
-rw-r--r--	io_uring/opdef.h	9
-rw-r--r--	io_uring/poll.c	15
-rw-r--r--	io_uring/poll.h	9
-rw-r--r--	io_uring/refs.h	7
-rw-r--r--	io_uring/register.c	3
-rw-r--r--	io_uring/rsrc.c	47
-rw-r--r--	io_uring/rsrc.h	13
-rw-r--r--	io_uring/rw.c	585
-rw-r--r--	io_uring/rw.h	25
-rw-r--r--	io_uring/sqpoll.c	8
-rw-r--r--	io_uring/timeout.c	9
-rw-r--r--	io_uring/uring_cmd.c	122
-rw-r--r--	io_uring/uring_cmd.h	8
-rw-r--r--	io_uring/waitid.c	2
35 files changed, 1898 insertions, 1698 deletions
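The accept tweaks from the summary mirror the existing recv flags and are
passed in sqe->ioprio, like IORING_ACCEPT_MULTISHOT. A hedged liburing
sketch (the two flag names are from this series; everything else is
standard liburing, and listen_fd is a placeholder listening socket):

	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	io_uring_prep_accept(sqe, listen_fd, NULL, NULL, 0);
	/* skip the initial accept attempt, rely purely on poll to trigger it */
	sqe->ioprio |= IORING_ACCEPT_POLL_FIRST;
	/* alternatively: fail with -EAGAIN instead of arming poll for retry */
	/* sqe->ioprio |= IORING_ACCEPT_DONTWAIT; */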
diff --git a/io_uring/Makefile b/io_uring/Makefile
index 2e1d4e03799c..fc1b23c524e8 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -2,13 +2,14 @@
#
# Makefile for io_uring
-obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
- sync.o advise.o filetable.o \
- openclose.o uring_cmd.o epoll.o \
- statx.o net.o msg_ring.o timeout.o \
- sqpoll.o fdinfo.o tctx.o poll.o \
- cancel.o kbuf.o rsrc.o rw.o opdef.o \
- notif.o waitid.o register.o truncate.o
+obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
+ tctx.o filetable.o rw.o net.o poll.o \
+ uring_cmd.o openclose.o sqpoll.o \
+ xattr.o nop.o fs.o splice.o sync.o \
+ msg_ring.o advise.o openclose.o \
+ epoll.o statx.o timeout.o fdinfo.o \
+ cancel.o waitid.o register.o \
+ truncate.o memmap.o
obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FUTEX) += futex.o
obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h
index bf2fb26a6539..b7a38a2069cf 100644
--- a/io_uring/alloc_cache.h
+++ b/io_uring/alloc_cache.h
@@ -4,63 +4,58 @@
/*
* Don't allow the cache to grow beyond this size.
*/
-#define IO_ALLOC_CACHE_MAX 512
-
-struct io_cache_entry {
- struct io_wq_work_node node;
-};
+#define IO_ALLOC_CACHE_MAX 128
static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
- struct io_cache_entry *entry)
+ void *entry)
{
if (cache->nr_cached < cache->max_cached) {
- cache->nr_cached++;
- wq_stack_add_head(&entry->node, &cache->list);
- kasan_mempool_poison_object(entry);
+ if (!kasan_mempool_poison_object(entry))
+ return false;
+ cache->entries[cache->nr_cached++] = entry;
return true;
}
return false;
}
-static inline bool io_alloc_cache_empty(struct io_alloc_cache *cache)
-{
- return !cache->list.next;
-}
-
-static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache)
+static inline void *io_alloc_cache_get(struct io_alloc_cache *cache)
{
- if (cache->list.next) {
- struct io_cache_entry *entry;
+ if (cache->nr_cached) {
+ void *entry = cache->entries[--cache->nr_cached];
- entry = container_of(cache->list.next, struct io_cache_entry, node);
kasan_mempool_unpoison_object(entry, cache->elem_size);
- cache->list.next = cache->list.next->next;
- cache->nr_cached--;
return entry;
}
return NULL;
}
-static inline void io_alloc_cache_init(struct io_alloc_cache *cache,
+/* returns false if the cache was initialized properly */
+static inline bool io_alloc_cache_init(struct io_alloc_cache *cache,
unsigned max_nr, size_t size)
{
- cache->list.next = NULL;
- cache->nr_cached = 0;
- cache->max_cached = max_nr;
- cache->elem_size = size;
+ cache->entries = kvmalloc_array(max_nr, sizeof(void *), GFP_KERNEL);
+ if (cache->entries) {
+ cache->nr_cached = 0;
+ cache->max_cached = max_nr;
+ cache->elem_size = size;
+ return false;
+ }
+ return true;
}
static inline void io_alloc_cache_free(struct io_alloc_cache *cache,
- void (*free)(struct io_cache_entry *))
+ void (*free)(const void *))
{
- while (1) {
- struct io_cache_entry *entry = io_alloc_cache_get(cache);
+ void *entry;
+
+ if (!cache->entries)
+ return;
- if (!entry)
- break;
+ while ((entry = io_alloc_cache_get(cache)) != NULL)
free(entry);
- }
- cache->nr_cached = 0;
+
+ kvfree(cache->entries);
+ cache->entries = NULL;
}
#endif
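The reworked cache above stores bare pointers in a kvmalloc'ed array
rather than threading an io_cache_entry node through each object, and note
the inverted convention: io_alloc_cache_init() returns false on success so
callers can OR together several init calls and check once. A sketch of the
intended usage pattern, with foo_cache and struct io_foo as hypothetical
stand-ins for a real per-ctx cache:

	/* setup: a true return means the backing array allocation failed */
	if (io_alloc_cache_init(&ctx->foo_cache, IO_ALLOC_CACHE_MAX,
				sizeof(struct io_foo)))
		goto err;

	/* fast path: pop a cached object, fall back to the allocator */
	foo = io_alloc_cache_get(&ctx->foo_cache);
	if (!foo)
		foo = kmalloc(sizeof(*foo), GFP_KERNEL);

	/* completion: recycle if there is room, otherwise free */
	if (!io_alloc_cache_put(&ctx->foo_cache, foo))
		kfree(foo);

	/* teardown: entries are plain pointers, so kfree works directly */
	io_alloc_cache_free(&ctx->foo_cache, kfree);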
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index acfcdd7f059a..a6e58a20efdd 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -184,9 +184,7 @@ static int __io_async_cancel(struct io_cancel_data *cd,
io_ring_submit_lock(ctx, issue_flags);
ret = -ENOENT;
list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
- struct io_uring_task *tctx = node->task->io_uring;
-
- ret = io_async_cancel_one(tctx, cd);
+ ret = io_async_cancel_one(node->task->io_uring, cd);
if (ret != -ENOENT) {
if (!all)
break;
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index 8d444dd1b0a7..b1e0e0d85349 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -50,9 +50,9 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
* Caller holds a reference to the file already, we don't need to do
* anything else to get an extra reference.
*/
-__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
+__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
{
- struct io_ring_ctx *ctx = f->private_data;
+ struct io_ring_ctx *ctx = file->private_data;
struct io_overflow_cqe *ocqe;
struct io_rings *r = ctx->rings;
struct rusage sq_usage;
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index 6e86e6188dbe..997c56d32ee6 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -84,12 +84,12 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
return ret;
file_slot->file_ptr = 0;
- io_file_bitmap_clear(&ctx->file_table, slot_index);
+ } else {
+ io_file_bitmap_set(&ctx->file_table, slot_index);
}
*io_get_tag_slot(ctx->file_data, slot_index) = 0;
io_fixed_file_set(file_slot, file);
- io_file_bitmap_set(&ctx->file_table, slot_index);
return 0;
}
diff --git a/io_uring/futex.c b/io_uring/futex.c
index 792a03df58de..914848f46beb 100644
--- a/io_uring/futex.c
+++ b/io_uring/futex.c
@@ -9,7 +9,7 @@
#include "../kernel/futex/futex.h"
#include "io_uring.h"
-#include "rsrc.h"
+#include "alloc_cache.h"
#include "futex.h"
struct io_futex {
@@ -27,27 +27,21 @@ struct io_futex {
};
struct io_futex_data {
- union {
- struct futex_q q;
- struct io_cache_entry cache;
- };
+ struct futex_q q;
struct io_kiocb *req;
};
-void io_futex_cache_init(struct io_ring_ctx *ctx)
-{
- io_alloc_cache_init(&ctx->futex_cache, IO_NODE_ALLOC_CACHE_MAX,
- sizeof(struct io_futex_data));
-}
+#define IO_FUTEX_ALLOC_CACHE_MAX 32
-static void io_futex_cache_entry_free(struct io_cache_entry *entry)
+bool io_futex_cache_init(struct io_ring_ctx *ctx)
{
- kfree(container_of(entry, struct io_futex_data, cache));
+ return io_alloc_cache_init(&ctx->futex_cache, IO_FUTEX_ALLOC_CACHE_MAX,
+ sizeof(struct io_futex_data));
}
void io_futex_cache_free(struct io_ring_ctx *ctx)
{
- io_alloc_cache_free(&ctx->futex_cache, io_futex_cache_entry_free);
+ io_alloc_cache_free(&ctx->futex_cache, kfree);
}
static void __io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts)
@@ -63,7 +57,7 @@ static void io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts)
struct io_ring_ctx *ctx = req->ctx;
io_tw_lock(ctx, ts);
- if (!io_alloc_cache_put(&ctx->futex_cache, &ifd->cache))
+ if (!io_alloc_cache_put(&ctx->futex_cache, ifd))
kfree(ifd);
__io_futex_complete(req, ts);
}
@@ -259,11 +253,11 @@ static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q)
static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx)
{
- struct io_cache_entry *entry;
+ struct io_futex_data *ifd;
- entry = io_alloc_cache_get(&ctx->futex_cache);
- if (entry)
- return container_of(entry, struct io_futex_data, cache);
+ ifd = io_alloc_cache_get(&ctx->futex_cache);
+ if (ifd)
+ return ifd;
return kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT);
}
diff --git a/io_uring/futex.h b/io_uring/futex.h
index 0847e9e8a127..b8bb09873d57 100644
--- a/io_uring/futex.h
+++ b/io_uring/futex.h
@@ -13,7 +13,7 @@ int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
unsigned int issue_flags);
bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
bool cancel_all);
-void io_futex_cache_init(struct io_ring_ctx *ctx);
+bool io_futex_cache_init(struct io_ring_ctx *ctx);
void io_futex_cache_free(struct io_ring_ctx *ctx);
#else
static inline int io_futex_cancel(struct io_ring_ctx *ctx,
@@ -27,8 +27,9 @@ static inline bool io_futex_remove_all(struct io_ring_ctx *ctx,
{
return false;
}
-static inline void io_futex_cache_init(struct io_ring_ctx *ctx)
+static inline bool io_futex_cache_init(struct io_ring_ctx *ctx)
{
+ return false;
}
static inline void io_futex_cache_free(struct io_ring_ctx *ctx)
{
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 522196dfb0ff..d1c47a9d9215 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -25,10 +25,10 @@
#define WORKER_IDLE_TIMEOUT (5 * HZ)
enum {
- IO_WORKER_F_UP = 1, /* up and active */
- IO_WORKER_F_RUNNING = 2, /* account as running */
- IO_WORKER_F_FREE = 4, /* worker on free list */
- IO_WORKER_F_BOUND = 8, /* is doing bounded work */
+ IO_WORKER_F_UP = 0, /* up and active */
+ IO_WORKER_F_RUNNING = 1, /* account as running */
+ IO_WORKER_F_FREE = 2, /* worker on free list */
+ IO_WORKER_F_BOUND = 3, /* is doing bounded work */
};
enum {
@@ -44,21 +44,20 @@ enum {
*/
struct io_worker {
refcount_t ref;
- unsigned flags;
+ int create_index;
+ unsigned long flags;
struct hlist_nulls_node nulls_node;
struct list_head all_list;
struct task_struct *task;
struct io_wq *wq;
struct io_wq_work *cur_work;
- struct io_wq_work *next_work;
raw_spinlock_t lock;
struct completion ref_done;
unsigned long create_state;
struct callback_head create_work;
- int create_index;
union {
struct rcu_head rcu;
@@ -165,7 +164,7 @@ static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq,
static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker)
{
- return io_get_acct(worker->wq, worker->flags & IO_WORKER_F_BOUND);
+ return io_get_acct(worker->wq, test_bit(IO_WORKER_F_BOUND, &worker->flags));
}
static void io_worker_ref_put(struct io_wq *wq)
@@ -225,7 +224,7 @@ static void io_worker_exit(struct io_worker *worker)
wait_for_completion(&worker->ref_done);
raw_spin_lock(&wq->lock);
- if (worker->flags & IO_WORKER_F_FREE)
+ if (test_bit(IO_WORKER_F_FREE, &worker->flags))
hlist_nulls_del_rcu(&worker->nulls_node);
list_del_rcu(&worker->all_list);
raw_spin_unlock(&wq->lock);
@@ -410,7 +409,7 @@ static void io_wq_dec_running(struct io_worker *worker)
struct io_wq_acct *acct = io_wq_get_acct(worker);
struct io_wq *wq = worker->wq;
- if (!(worker->flags & IO_WORKER_F_UP))
+ if (!test_bit(IO_WORKER_F_UP, &worker->flags))
return;
if (!atomic_dec_and_test(&acct->nr_running))
@@ -430,8 +429,8 @@ static void io_wq_dec_running(struct io_worker *worker)
*/
static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker)
{
- if (worker->flags & IO_WORKER_F_FREE) {
- worker->flags &= ~IO_WORKER_F_FREE;
+ if (test_bit(IO_WORKER_F_FREE, &worker->flags)) {
+ clear_bit(IO_WORKER_F_FREE, &worker->flags);
raw_spin_lock(&wq->lock);
hlist_nulls_del_init_rcu(&worker->nulls_node);
raw_spin_unlock(&wq->lock);
@@ -444,8 +443,8 @@ static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker)
static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker)
__must_hold(wq->lock)
{
- if (!(worker->flags & IO_WORKER_F_FREE)) {
- worker->flags |= IO_WORKER_F_FREE;
+ if (!test_bit(IO_WORKER_F_FREE, &worker->flags)) {
+ set_bit(IO_WORKER_F_FREE, &worker->flags);
hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list);
}
}
@@ -539,7 +538,6 @@ static void io_assign_current_work(struct io_worker *worker,
raw_spin_lock(&worker->lock);
worker->cur_work = work;
- worker->next_work = NULL;
raw_spin_unlock(&worker->lock);
}
@@ -564,10 +562,7 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
* clear the stalled flag.
*/
work = io_get_next_work(acct, worker);
- raw_spin_unlock(&acct->lock);
if (work) {
- __io_worker_busy(wq, worker);
-
/*
* Make sure cancelation can find this, even before
* it becomes the active work. That avoids a window
@@ -576,11 +571,17 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
* current work item for this worker.
*/
raw_spin_lock(&worker->lock);
- worker->next_work = work;
+ worker->cur_work = work;
raw_spin_unlock(&worker->lock);
- } else {
- break;
}
+
+ raw_spin_unlock(&acct->lock);
+
+ if (!work)
+ break;
+
+ __io_worker_busy(wq, worker);
+
io_assign_current_work(worker, work);
__set_current_state(TASK_RUNNING);
@@ -631,7 +632,8 @@ static int io_wq_worker(void *data)
bool exit_mask = false, last_timeout = false;
char buf[TASK_COMM_LEN];
- worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
+ set_mask_bits(&worker->flags, 0,
+ BIT(IO_WORKER_F_UP) | BIT(IO_WORKER_F_RUNNING));
snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid);
set_task_comm(current, buf);
@@ -695,11 +697,11 @@ void io_wq_worker_running(struct task_struct *tsk)
if (!worker)
return;
- if (!(worker->flags & IO_WORKER_F_UP))
+ if (!test_bit(IO_WORKER_F_UP, &worker->flags))
return;
- if (worker->flags & IO_WORKER_F_RUNNING)
+ if (test_bit(IO_WORKER_F_RUNNING, &worker->flags))
return;
- worker->flags |= IO_WORKER_F_RUNNING;
+ set_bit(IO_WORKER_F_RUNNING, &worker->flags);
io_wq_inc_running(worker);
}
@@ -713,12 +715,12 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
if (!worker)
return;
- if (!(worker->flags & IO_WORKER_F_UP))
+ if (!test_bit(IO_WORKER_F_UP, &worker->flags))
return;
- if (!(worker->flags & IO_WORKER_F_RUNNING))
+ if (!test_bit(IO_WORKER_F_RUNNING, &worker->flags))
return;
- worker->flags &= ~IO_WORKER_F_RUNNING;
+ clear_bit(IO_WORKER_F_RUNNING, &worker->flags);
io_wq_dec_running(worker);
}
@@ -732,7 +734,7 @@ static void io_init_new_worker(struct io_wq *wq, struct io_worker *worker,
raw_spin_lock(&wq->lock);
hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list);
list_add_tail_rcu(&worker->all_list, &wq->all_list);
- worker->flags |= IO_WORKER_F_FREE;
+ set_bit(IO_WORKER_F_FREE, &worker->flags);
raw_spin_unlock(&wq->lock);
wake_up_new_task(tsk);
}
@@ -838,7 +840,7 @@ fail:
init_completion(&worker->ref_done);
if (index == IO_WQ_ACCT_BOUND)
- worker->flags |= IO_WORKER_F_BOUND;
+ set_bit(IO_WORKER_F_BOUND, &worker->flags);
tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE);
if (!IS_ERR(tsk)) {
@@ -924,8 +926,8 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data)
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
{
struct io_wq_acct *acct = io_work_get_acct(wq, work);
+ unsigned long work_flags = work->flags;
struct io_cb_cancel_data match;
- unsigned work_flags = work->flags;
bool do_create;
/*
@@ -1005,8 +1007,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
* may dereference the passed in work.
*/
raw_spin_lock(&worker->lock);
- if (__io_wq_worker_cancel(worker, match, worker->cur_work) ||
- __io_wq_worker_cancel(worker, match, worker->next_work))
+ if (__io_wq_worker_cancel(worker, match, worker->cur_work))
match->nr_running++;
raw_spin_unlock(&worker->lock);
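Note the semantic shift in the IO_WORKER_F_* constants in this file: they
are now bit numbers for the atomic bitops API rather than mask values,
which is why IO_WORKER_F_UP changes from 1 to 0. A toy illustration of the
two styles (set_bit/test_bit/clear_bit/set_mask_bits are the stock kernel
helpers used by the patch):

	/* before: values are masks, combined and tested with plain RMW ops */
	worker->flags |= IO_WORKER_F_RUNNING;		/* non-atomic, trips KCSAN */
	if (worker->flags & IO_WORKER_F_RUNNING)
		io_wq_inc_running(worker);

	/* after: values are bit indexes, updated atomically */
	set_bit(IO_WORKER_F_RUNNING, &worker->flags);
	if (test_bit(IO_WORKER_F_RUNNING, &worker->flags))
		io_wq_inc_running(worker);
	clear_bit(IO_WORKER_F_RUNNING, &worker->flags);

	/* several bits at once: build a mask with BIT() */
	set_mask_bits(&worker->flags, 0,
		      BIT(IO_WORKER_F_UP) | BIT(IO_WORKER_F_RUNNING));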
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 380b9ce1d301..86fd72f6a1c2 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -63,7 +63,6 @@
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
-#include <linux/highmem.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/task_work.h>
@@ -95,6 +94,8 @@
#include "waitid.h"
#include "futex.h"
#include "napi.h"
+#include "uring_cmd.h"
+#include "memmap.h"
#include "timeout.h"
#include "poll.h"
@@ -170,17 +171,9 @@ static struct ctl_table kernel_io_uring_disabled_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {},
};
#endif
-static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
-{
- if (!wq_list_empty(&ctx->submit_state.compl_reqs) ||
- ctx->submit_state.cqes_count)
- __io_submit_flush_completions(ctx);
-}
-
static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
{
return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
@@ -253,14 +246,12 @@ static __cold void io_fallback_req_func(struct work_struct *work)
fallback_work.work);
struct llist_node *node = llist_del_all(&ctx->fallback_llist);
struct io_kiocb *req, *tmp;
- struct io_tw_state ts = { .locked = true, };
+ struct io_tw_state ts = {};
percpu_ref_get(&ctx->refs);
mutex_lock(&ctx->uring_lock);
llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
req->io_task_work.func(req, &ts);
- if (WARN_ON_ONCE(!ts.locked))
- return;
io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
percpu_ref_put(&ctx->refs);
@@ -284,6 +275,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
int hash_bits;
+ bool ret;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
@@ -312,14 +304,19 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->sqd_list);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
INIT_LIST_HEAD(&ctx->io_buffers_cache);
- INIT_HLIST_HEAD(&ctx->io_buf_list);
- io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
+ ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
sizeof(struct io_rsrc_node));
- io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
+ ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX,
sizeof(struct async_poll));
- io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
+ ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
sizeof(struct io_async_msghdr));
- io_futex_cache_init(ctx);
+ ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX,
+ sizeof(struct io_async_rw));
+ ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX,
+ sizeof(struct uring_cache));
+ ret |= io_futex_cache_init(ctx);
+ if (ret)
+ goto err;
init_completion(&ctx->ref_comp);
xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
mutex_init(&ctx->uring_lock);
@@ -337,7 +334,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
init_llist_head(&ctx->work_llist);
INIT_LIST_HEAD(&ctx->tctx_list);
ctx->submit_state.free_list.next = NULL;
- INIT_WQ_LIST(&ctx->locked_free_list);
INIT_HLIST_HEAD(&ctx->waitid_list);
#ifdef CONFIG_FUTEX
INIT_HLIST_HEAD(&ctx->futex_list);
@@ -349,6 +345,12 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
return ctx;
err:
+ io_alloc_cache_free(&ctx->rsrc_node_cache, kfree);
+ io_alloc_cache_free(&ctx->apoll_cache, kfree);
+ io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
+ io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
+ io_alloc_cache_free(&ctx->uring_cache, kfree);
+ io_futex_cache_free(ctx);
kfree(ctx->cancel_table.hbs);
kfree(ctx->cancel_table_locked.hbs);
xa_destroy(&ctx->io_bl_xa);
@@ -379,7 +381,7 @@ static void io_clean_op(struct io_kiocb *req)
{
if (req->flags & REQ_F_BUFFER_SELECTED) {
spin_lock(&req->ctx->completion_lock);
- io_put_kbuf_comp(req);
+ io_kbuf_drop(req);
spin_unlock(&req->ctx->completion_lock);
}
@@ -498,7 +500,7 @@ static void io_prep_async_link(struct io_kiocb *req)
}
}
-void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use)
+static void io_queue_iowq(struct io_kiocb *req)
{
struct io_kiocb *link = io_prep_linked_timeout(req);
struct io_uring_task *tctx = req->task->io_uring;
@@ -666,28 +668,14 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx)
io_commit_cqring_flush(ctx);
}
-static void io_cqring_overflow_kill(struct io_ring_ctx *ctx)
-{
- struct io_overflow_cqe *ocqe;
- LIST_HEAD(list);
-
- spin_lock(&ctx->completion_lock);
- list_splice_init(&ctx->cq_overflow_list, &list);
- clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
- spin_unlock(&ctx->completion_lock);
-
- while (!list_empty(&list)) {
- ocqe = list_first_entry(&list, struct io_overflow_cqe, list);
- list_del(&ocqe->list);
- kfree(ocqe);
- }
-}
-
-static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx)
+static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying)
{
size_t cqe_size = sizeof(struct io_uring_cqe);
- if (__io_cqring_events(ctx) == ctx->cq_entries)
+ lockdep_assert_held(&ctx->uring_lock);
+
+ /* don't abort if we're dying, entries must get freed */
+ if (!dying && __io_cqring_events(ctx) == ctx->cq_entries)
return;
if (ctx->flags & IORING_SETUP_CQE32)
@@ -698,11 +686,14 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx)
struct io_uring_cqe *cqe;
struct io_overflow_cqe *ocqe;
- if (!io_get_cqe_overflow(ctx, &cqe, true))
- break;
ocqe = list_first_entry(&ctx->cq_overflow_list,
struct io_overflow_cqe, list);
- memcpy(cqe, &ocqe->cqe, cqe_size);
+
+ if (!dying) {
+ if (!io_get_cqe_overflow(ctx, &cqe, true))
+ break;
+ memcpy(cqe, &ocqe->cqe, cqe_size);
+ }
list_del(&ocqe->list);
kfree(ocqe);
}
@@ -714,20 +705,17 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx)
io_cq_unlock_post(ctx);
}
-static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
+static void io_cqring_overflow_kill(struct io_ring_ctx *ctx)
{
- /* iopoll syncs against uring_lock, not completion_lock */
- if (ctx->flags & IORING_SETUP_IOPOLL)
-