summary refs log tree commit diff
path: root/io_uring
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2025-05-26 12:13:22 -0700
committer Linus Torvalds <torvalds@linux-foundation.org> 2025-05-26 12:13:22 -0700
commit 49fffac983ac52aea0ab94914be3f56bcf92d5dc (patch)
tree 6d827dbf660c7b54430e448cfeec69b40dd93768 /io_uring
parent 6f59de9bc0d576eb5a5edfea470527902315e924 (diff)
parent 6faaf6e0faf1cc9a1359cfe6ecb4d9711b4a9f29 (diff)
download linux-49fffac983ac52aea0ab94914be3f56bcf92d5dc.tar.gz
linux-49fffac983ac52aea0ab94914be3f56bcf92d5dc.tar.bz2
linux-49fffac983ac52aea0ab94914be3f56bcf92d5dc.zip
Merge tag 'for-6.16/io_uring-20250523' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:

 - Avoid indirect function calls in io-wq for executing and freeing work. The design of io-wq is such that it can be a generic mechanism, but as it's just used by io_uring now, may as well avoid these indirect calls
 - Clean up registered buffers for networking
 - Add support for IORING_OP_PIPE. Pretty straightforward, allows creating pipes with io_uring, particularly useful for having these be instantiated as direct descriptors
 - Clean up the coalescing support for registered buffers
 - Add support for multiple interface queues for zero-copy rx networking. As this feature was merged for 6.15 it supported just a single ifq per ring
 - Clean up the eventfd support
 - Add dma-buf support to zero-copy rx
 - Clean up and improve the request draining support
 - Clean up provided buffer support, most notably with an eye toward making the legacy support less intrusive
 - Minor fdinfo cleanups, dropping support for dumping what credentials are registered
 - Improve support for overflow CQE handling, getting rid of GFP_ATOMIC for allocating overflow entries where possible
 - Improve detection of cases where io-wq doesn't need to spawn a new worker
 - Various little cleanups

* tag 'for-6.16/io_uring-20250523' of git://git.kernel.dk/linux: (59 commits)
  io_uring/cmd: warn on reg buf imports by ineligible cmds
  io_uring/io-wq: only create a new worker if it can make progress
  io_uring/io-wq: ignore non-busy worker going to sleep
  io_uring/io-wq: move hash helpers to the top
  trace/io_uring: fix io_uring_local_work_run ctx documentation
  io_uring: finish IOU_OK -> IOU_COMPLETE transition
  io_uring: add new helpers for posting overflows
  io_uring: pass in struct io_big_cqe to io_alloc_ocqe()
  io_uring: make io_alloc_ocqe() take a struct io_cqe pointer
  io_uring: split alloc and add of overflow
  io_uring: open code io_req_cqe_overflow()
  io_uring/fdinfo: get rid of dumping credentials
  io_uring/fdinfo: only compile if CONFIG_PROC_FS is set
  io_uring/kbuf: unify legacy buf provision and removal
  io_uring/kbuf: refactor __io_remove_buffers
  io_uring/kbuf: don't compute size twice on prep
  io_uring/kbuf: drop extra vars in io_register_pbuf_ring
  io_uring/kbuf: use mem_is_zero()
  io_uring/kbuf: account ring io_buffer_list memory
  io_uring: drain based on allocates reqs
  ...
Diffstat (limited to 'io_uring')
-rw-r--r-- io_uring/Makefile 6
-rw-r--r-- io_uring/advise.c 4
-rw-r--r-- io_uring/cancel.c 2
-rw-r--r-- io_uring/cmd_net.c 83
-rw-r--r-- io_uring/epoll.c 4
-rw-r--r-- io_uring/eventfd.c 66
-rw-r--r-- io_uring/eventfd.h 3
-rw-r--r-- io_uring/fdinfo.c 40
-rw-r--r-- io_uring/fs.c 10
-rw-r--r-- io_uring/futex.c 6
-rw-r--r-- io_uring/io-wq.c 65
-rw-r--r-- io_uring/io-wq.h 5
-rw-r--r-- io_uring/io_uring.c 285
-rw-r--r-- io_uring/io_uring.h 4
-rw-r--r-- io_uring/kbuf.c 148
-rw-r--r-- io_uring/kbuf.h 8
-rw-r--r-- io_uring/memmap.c 11
-rw-r--r-- io_uring/memmap.h 4
-rw-r--r-- io_uring/msg_ring.c 2
-rw-r--r-- io_uring/net.c 62
-rw-r--r-- io_uring/nop.c 2
-rw-r--r-- io_uring/notif.c 1
-rw-r--r-- io_uring/opdef.c 11
-rw-r--r-- io_uring/openclose.c 139
-rw-r--r-- io_uring/openclose.h 3
-rw-r--r-- io_uring/poll.c 4
-rw-r--r-- io_uring/rsrc.c 91
-rw-r--r-- io_uring/rsrc.h 28
-rw-r--r-- io_uring/rw.c 7
-rw-r--r-- io_uring/rw.h 2
-rw-r--r-- io_uring/splice.c 4
-rw-r--r-- io_uring/statx.c 2
-rw-r--r-- io_uring/sync.c 6
-rw-r--r-- io_uring/tctx.c 2
-rw-r--r-- io_uring/timeout.c 13
-rw-r--r-- io_uring/timeout.h 13
-rw-r--r-- io_uring/truncate.c 2
-rw-r--r-- io_uring/uring_cmd.c 91
-rw-r--r-- io_uring/waitid.c 2
-rw-r--r-- io_uring/xattr.c 8
-rw-r--r-- io_uring/zcrx.c 372
-rw-r--r-- io_uring/zcrx.h 26
42 files changed, 944 insertions, 703 deletions
diff --git a/io_uring/Makefile b/io_uring/Makefile
index 3e28a741ca15..d97c6b51d584 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -7,11 +7,11 @@ GCOV_PROFILE := y
endif
obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
- tctx.o filetable.o rw.o net.o poll.o \
+ tctx.o filetable.o rw.o poll.o \
eventfd.o uring_cmd.o openclose.o \
sqpoll.o xattr.o nop.o fs.o splice.o \
sync.o msg_ring.o advise.o openclose.o \
- statx.o timeout.o fdinfo.o cancel.o \
+ statx.o timeout.o cancel.o \
waitid.o register.o truncate.o \
memmap.o alloc_cache.o
obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o
@@ -19,3 +19,5 @@ obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FUTEX) += futex.o
obj-$(CONFIG_EPOLL) += epoll.o
obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
+obj-$(CONFIG_NET) += net.o cmd_net.o
+obj-$(CONFIG_PROC_FS) += fdinfo.o
diff --git a/io_uring/advise.c b/io_uring/advise.c
index cb7b881665e5..0073f74e3658 100644
--- a/io_uring/advise.c
+++ b/io_uring/advise.c
@@ -58,7 +58,7 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
#else
return -EOPNOTSUPP;
#endif
@@ -104,5 +104,5 @@ int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 0870060bac7c..6d57602304df 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -229,7 +229,7 @@ done:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
static int __io_sync_cancel(struct io_uring_task *tctx,
diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c
new file mode 100644
index 000000000000..e99170c7d41a
--- /dev/null
+++ b/io_uring/cmd_net.c
@@ -0,0 +1,83 @@
+#include <asm/ioctls.h>
+#include <linux/io_uring/net.h>
+#include <net/sock.h>
+
+#include "uring_cmd.h"
+
+static inline int io_uring_cmd_getsockopt(struct socket *sock,
+ struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ const struct io_uring_sqe *sqe = cmd->sqe;
+ bool compat = !!(issue_flags & IO_URING_F_COMPAT);
+ int optlen, optname, level, err;
+ void __user *optval;
+
+ level = READ_ONCE(sqe->level);
+ if (level != SOL_SOCKET)
+ return -EOPNOTSUPP;
+
+ optval = u64_to_user_ptr(READ_ONCE(sqe->optval));
+ optname = READ_ONCE(sqe->optname);
+ optlen = READ_ONCE(sqe->optlen);
+
+ err = do_sock_getsockopt(sock, compat, level, optname,
+ USER_SOCKPTR(optval),
+ KERNEL_SOCKPTR(&optlen));
+ if (err)
+ return err;
+
+ /* On success, return optlen */
+ return optlen;
+}
+
+static inline int io_uring_cmd_setsockopt(struct socket *sock,
+ struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ const struct io_uring_sqe *sqe = cmd->sqe;
+ bool compat = !!(issue_flags & IO_URING_F_COMPAT);
+ int optname, optlen, level;
+ void __user *optval;
+ sockptr_t optval_s;
+
+ optval = u64_to_user_ptr(READ_ONCE(sqe->optval));
+ optname = READ_ONCE(sqe->optname);
+ optlen = READ_ONCE(sqe->optlen);
+ level = READ_ONCE(sqe->level);
+ optval_s = USER_SOCKPTR(optval);
+
+ return do_sock_setsockopt(sock, compat, level, optname, optval_s,
+ optlen);
+}
+
+int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ struct socket *sock = cmd->file->private_data;
+ struct sock *sk = sock->sk;
+ struct proto *prot = READ_ONCE(sk->sk_prot);
+ int ret, arg = 0;
+
+ if (!prot || !prot->ioctl)
+ return -EOPNOTSUPP;
+
+ switch (cmd->cmd_op) {
+ case SOCKET_URING_OP_SIOCINQ:
+ ret = prot->ioctl(sk, SIOCINQ, &arg);
+ if (ret)
+ return ret;
+ return arg;
+ case SOCKET_URING_OP_SIOCOUTQ:
+ ret = prot->ioctl(sk, SIOCOUTQ, &arg);
+ if (ret)
+ return ret;
+ return arg;
+ case SOCKET_URING_OP_GETSOCKOPT:
+ return io_uring_cmd_getsockopt(sock, cmd, issue_flags);
+ case SOCKET_URING_OP_SETSOCKOPT:
+ return io_uring_cmd_setsockopt(sock, cmd, issue_flags);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_sock);
diff --git a/io_uring/epoll.c b/io_uring/epoll.c
index 6d2c48ba1923..8d4610246ba0 100644
--- a/io_uring/epoll.c
+++ b/io_uring/epoll.c
@@ -61,7 +61,7 @@ int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -88,5 +88,5 @@ int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index 100d5da94cb9..78f8ab7db104 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -47,13 +47,6 @@ static void io_eventfd_do_signal(struct rcu_head *rcu)
io_eventfd_put(ev_fd);
}
-static void io_eventfd_release(struct io_ev_fd *ev_fd, bool put_ref)
-{
- if (put_ref)
- io_eventfd_put(ev_fd);
- rcu_read_unlock();
-}
-
/*
* Returns true if the caller should put the ev_fd reference, false if not.
*/
@@ -72,63 +65,34 @@ static bool __io_eventfd_signal(struct io_ev_fd *ev_fd)
/*
* Trigger if eventfd_async isn't set, or if it's set and the caller is
- * an async worker. If ev_fd isn't valid, obviously return false.
+ * an async worker.
*/
static bool io_eventfd_trigger(struct io_ev_fd *ev_fd)
{
- if (ev_fd)
- return !ev_fd->eventfd_async || io_wq_current_is_worker();
- return false;
+ return !ev_fd->eventfd_async || io_wq_current_is_worker();
}
-/*
- * On success, returns with an ev_fd reference grabbed and the RCU read
- * lock held.
- */
-static struct io_ev_fd *io_eventfd_grab(struct io_ring_ctx *ctx)
+void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event)
{
+ bool skip = false;
struct io_ev_fd *ev_fd;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
- return NULL;
-
- rcu_read_lock();
+ return;
- /*
- * rcu_dereference ctx->io_ev_fd once and use it for both for checking
- * and eventfd_signal
- */
+ guard(rcu)();
ev_fd = rcu_dereference(ctx->io_ev_fd);
-
/*
* Check again if ev_fd exists in case an io_eventfd_unregister call
* completed between the NULL check of ctx->io_ev_fd at the start of
* the function and rcu_read_lock.
*/
- if (io_eventfd_trigger(ev_fd) && refcount_inc_not_zero(&ev_fd->refs))
- return ev_fd;
-
- rcu_read_unlock();
- return NULL;
-}
-
-void io_eventfd_signal(struct io_ring_ctx *ctx)
-{
- struct io_ev_fd *ev_fd;
-
- ev_fd = io_eventfd_grab(ctx);
- if (ev_fd)
- io_eventfd_release(ev_fd, __io_eventfd_signal(ev_fd));
-}
-
-void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
-{
- struct io_ev_fd *ev_fd;
-
- ev_fd = io_eventfd_grab(ctx);
- if (ev_fd) {
- bool skip, put_ref = true;
+ if (!ev_fd)
+ return;
+ if (!io_eventfd_trigger(ev_fd) || !refcount_inc_not_zero(&ev_fd->refs))
+ return;
+ if (cqe_event) {
/*
* Eventfd should only get triggered when at least one event
* has been posted. Some applications rely on the eventfd
@@ -142,12 +106,10 @@ void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
skip = ctx->cached_cq_tail == ev_fd->last_cq_tail;
ev_fd->last_cq_tail = ctx->cached_cq_tail;
spin_unlock(&ctx->completion_lock);
-
- if (!skip)
- put_ref = __io_eventfd_signal(ev_fd);
-
- io_eventfd_release(ev_fd, put_ref);
}
+
+ if (skip || __io_eventfd_signal(ev_fd))
+ io_eventfd_put(ev_fd);
}
int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
diff --git a/io_uring/eventfd.h b/io_uring/eventfd.h
index d394f49c6321..e2f1985c2cf9 100644
--- a/io_uring/eventfd.h
+++ b/io_uring/eventfd.h
@@ -4,5 +4,4 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int eventfd_async);
int io_eventfd_unregister(struct io_ring_ctx *ctx);
-void io_eventfd_flush_signal(struct io_ring_ctx *ctx);
-void io_eventfd_signal(struct io_ring_ctx *ctx);
+void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event);
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index e0d6a59a89fa..e9355276ab5d 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -15,37 +15,6 @@
#include "cancel.h"
#include "rsrc.h"
-#ifdef CONFIG_PROC_FS
-static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
- const struct cred *cred)
-{
- struct user_namespace *uns = seq_user_ns(m);
- struct group_info *gi;
- kernel_cap_t cap;
- int g;
-
- seq_printf(m, "%5d\n", id);
- seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
- seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
- seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
- seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
- seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
- seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
- seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
- seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
- seq_puts(m, "\n\tGroups:\t");
- gi = cred->group_info;
- for (g = 0; g < gi->ngroups; g++) {
- seq_put_decimal_ull(m, g ? " " : "",
- from_kgid_munged(uns, gi->gid[g]));
- }
- seq_puts(m, "\n\tCapEff:\t");
- cap = cred->cap_effective;
- seq_put_hex_ll(m, NULL, cap.val, 16);
- seq_putc(m, '\n');
- return 0;
-}
-
#ifdef CONFIG_NET_RX_BUSY_POLL
static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx,
struct seq_file *m,
@@ -214,14 +183,6 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
else
seq_printf(m, "%5u: <none>\n", i);
}
- if (!xa_empty(&ctx->personalities)) {
- unsigned long index;
- const struct cred *cred;
-
- seq_printf(m, "Personalities:\n");
- xa_for_each(&ctx->personalities, index, cred)
- io_uring_show_cred(m, index, cred);
- }
seq_puts(m, "PollList:\n");
for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) {
@@ -264,4 +225,3 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
mutex_unlock(&ctx->uring_lock);
}
}
-#endif
diff --git a/io_uring/fs.c b/io_uring/fs.c
index eccea851dd5a..37079a414eab 100644
--- a/io_uring/fs.c
+++ b/io_uring/fs.c
@@ -90,7 +90,7 @@ int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
void io_renameat_cleanup(struct io_kiocb *req)
@@ -141,7 +141,7 @@ int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
void io_unlinkat_cleanup(struct io_kiocb *req)
@@ -185,7 +185,7 @@ int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
void io_mkdirat_cleanup(struct io_kiocb *req)
@@ -235,7 +235,7 @@ int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -281,7 +281,7 @@ int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
void io_link_cleanup(struct io_kiocb *req)
diff --git a/io_uring/futex.c b/io_uring/futex.c
index 0ea4820cd8ff..b34695022baa 100644
--- a/io_uring/futex.c
+++ b/io_uring/futex.c
@@ -234,7 +234,7 @@ int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags)
kfree(futexv);
req->async_data = NULL;
req->flags &= ~REQ_F_ASYNC_DATA;
- return IOU_OK;
+ return IOU_COMPLETE;
}
/*
@@ -311,7 +311,7 @@ done:
req_set_fail(req);
io_req_set_res(req, ret, 0);
kfree(ifd);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags)
@@ -328,5 +328,5 @@ int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags)
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 04a75d666195..cd1fcb115739 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -114,9 +114,6 @@ enum {
struct io_wq {
unsigned long state;
- free_work_fn *free_work;
- io_wq_work_fn *do_work;
-
struct io_wq_hash *hash;
atomic_t worker_refs;
@@ -153,6 +150,16 @@ static bool io_acct_cancel_pending_work(struct io_wq *wq,
static void create_worker_cb(struct callback_head *cb);
static void io_wq_cancel_tw_create(struct io_wq *wq);
+static inline unsigned int __io_get_work_hash(unsigned int work_flags)
+{
+ return work_flags >> IO_WQ_HASH_SHIFT;
+}
+
+static inline unsigned int io_get_work_hash(struct io_wq_work *work)
+{
+ return __io_get_work_hash(atomic_read(&work->flags));
+}
+
static bool io_worker_get(struct io_worker *worker)
{
return refcount_inc_not_zero(&worker->ref);
@@ -412,6 +419,30 @@ fail:
return false;
}
+/* Defer if current and next work are both hashed to the same chain */
+static bool io_wq_hash_defer(struct io_wq_work *work, struct io_wq_acct *acct)
+{
+ unsigned int hash, work_flags;
+ struct io_wq_work *next;
+
+ lockdep_assert_held(&acct->lock);
+
+ work_flags = atomic_read(&work->flags);
+ if (!__io_wq_is_hashed(work_flags))
+ return false;
+
+ /* should not happen, io_acct_run_queue() said we had work */
+ if (wq_list_empty(&acct->work_list))
+ return true;
+
+ hash = __io_get_work_hash(work_flags);
+ next = container_of(acct->work_list.first, struct io_wq_work, list);
+ work_flags = atomic_read(&next->flags);
+ if (!__io_wq_is_hashed(work_flags))
+ return false;
+ return hash == __io_get_work_hash(work_flags);
+}
+
static void io_wq_dec_running(struct io_worker *worker)
{
struct io_wq_acct *acct = io_wq_get_acct(worker);
@@ -422,8 +453,14 @@ static void io_wq_dec_running(struct io_worker *worker)
if (!atomic_dec_and_test(&acct->nr_running))
return;
+ if (!worker->cur_work)
+ return;
if (!io_acct_run_queue(acct))
return;
+ if (io_wq_hash_defer(worker->cur_work, acct)) {
+ raw_spin_unlock(&acct->lock);
+ return;
+ }
raw_spin_unlock(&acct->lock);
atomic_inc(&acct->nr_running);
@@ -457,16 +494,6 @@ static void __io_worker_idle(struct io_wq_acct *acct, struct io_worker *worker)
}
}
-static inline unsigned int __io_get_work_hash(unsigned int work_flags)
-{
- return work_flags >> IO_WQ_HASH_SHIFT;
-}
-
-static inline unsigned int io_get_work_hash(struct io_wq_work *work)
-{
- return __io_get_work_hash(atomic_read(&work->flags));
-}
-
static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash)
{
bool ret = false;
@@ -612,10 +639,10 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
if (do_kill &&
(work_flags & IO_WQ_WORK_UNBOUND))
atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
- wq->do_work(work);
+ io_wq_submit_work(work);
io_assign_current_work(worker, NULL);
- linked = wq->free_work(work);
+ linked = io_wq_free_work(work);
work = next_hashed;
if (!work && linked && !io_wq_is_hashed(linked)) {
work = linked;
@@ -934,8 +961,8 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq)
{
do {
atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
- wq->do_work(work);
- work = wq->free_work(work);
+ io_wq_submit_work(work);
+ work = io_wq_free_work(work);
} while (work);
}
@@ -1195,8 +1222,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
int ret, i;
struct io_wq *wq;
- if (WARN_ON_ONCE(!data->free_work || !data->do_work))
- return ERR_PTR(-EINVAL);
if (WARN_ON_ONCE(!bounded))
return ERR_PTR(-EINVAL);
@@ -1206,8 +1231,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
refcount_inc(&data->hash->refs);
wq->hash = data->hash;
- wq->free_work = data->free_work;
- wq->do_work = data->do_work;
ret = -ENOMEM;
diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h
index d4fb2940e435..774abab54732 100644
--- a/io_uring/io-wq.h
+++ b/io_uring/io-wq.h
@@ -21,9 +21,6 @@ enum io_wq_cancel {
IO_WQ_CANCEL_NOTFOUND, /* work not found */
};
-typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
-typedef void (io_wq_work_fn)(struct io_wq_work *);
-
struct io_wq_hash {
refcount_t refs;
unsigned long map;
@@ -39,8 +36,6 @@ static inline void io_wq_put_hash(struct io_wq_hash *hash)
struct io_wq_data {
struct io_wq_hash *hash;
struct task_struct *task;
- io_wq_work_fn *do_work;
- free_work_fn *free_work;
};
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 46373549a733..c7a9cecf528e 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -129,7 +129,6 @@
struct io_defer_entry {
struct list_head list;
struct io_kiocb *req;
- u32 seq;
};
/* requests with any of those set should undergo io_disarm_next() */
@@ -149,6 +148,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
bool is_sqpoll_thread);
static void io_queue_sqe(struct io_kiocb *req);
+static void __io_req_caches_free(struct io_ring_ctx *ctx);
static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray);
@@ -359,6 +359,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->tctx_list);
ctx->submit_state.free_list.next = NULL;
INIT_HLIST_HEAD(&ctx->waitid_list);
+ xa_init_flags(&ctx->zcrx_ctxs, XA_FLAGS_ALLOC);
#ifdef CONFIG_FUTEX
INIT_HLIST_HEAD(&ctx->futex_list);
#endif
@@ -380,25 +381,6 @@ err:
return NULL;
}
-static void io_account_cq_overflow(struct io_ring_ctx *ctx)
-{
- struct io_rings *r = ctx->rings;
-
- WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
- ctx->cq_extra--;
-}
-
-static bool req_need_defer(struct io_kiocb *req, u32 seq)
-{
- if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
- struct io_ring_ctx *ctx = req->ctx;
-
- return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
- }
-
- return false;
-}
-
static void io_clean_op(struct io_kiocb *req)
{
if (unlikely(req->flags & REQ_F_BUFFER_SELECTED))
@@ -537,20 +519,37 @@ void io_req_queue_iowq(struct io_kiocb *req)
io_req_task_work_add(req);
}
+static unsigned io_linked_nr(struct io_kiocb *req)
+{
+ struct io_kiocb *tmp;
+ unsigned nr = 0;
+
+ io_for_each_link(tmp, req)
+ nr++;
+ return nr;
+}
+
static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx)
{
- spin_lock(&ctx->completion_lock);
+ bool drain_seen = false, first = true;
+
+ lockdep_assert_held(&ctx->uring_lock);
+ __io_req_caches_free(ctx);
+
while (!list_empty(&ctx->defer_list)) {
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
struct io_defer_entry, list);
- if (req_need_defer(de->req, de->seq))
- break;
+ drain_seen |= de->req->flags & REQ_F_IO_DRAIN;
+ if ((drain_seen || first) && ctx->nr_req_allocated != ctx->nr_drained)
+ return;
+
list_del_init(&de->list);
+ ctx->nr_drained -= io_linked_nr(de->req);
io_req_task_queue(de->req);
kfree(de);
+ first = false;
}
- spin_unlock(&ctx->completion_lock);
}
void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
@@ -559,10 +558,8 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
io_poll_wq_wake(ctx);
if (ctx->off_timeout_used)
io_flush_timeouts(ctx);
- if (ctx->drain_active)
- io_queue_deferred(ctx);<