diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-05-26 12:13:22 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-05-26 12:13:22 -0700 |
| commit | 49fffac983ac52aea0ab94914be3f56bcf92d5dc (patch) | |
| tree | 6d827dbf660c7b54430e448cfeec69b40dd93768 /io_uring | |
| parent | 6f59de9bc0d576eb5a5edfea470527902315e924 (diff) | |
| parent | 6faaf6e0faf1cc9a1359cfe6ecb4d9711b4a9f29 (diff) | |
| download | linux-49fffac983ac52aea0ab94914be3f56bcf92d5dc.tar.gz linux-49fffac983ac52aea0ab94914be3f56bcf92d5dc.tar.bz2 linux-49fffac983ac52aea0ab94914be3f56bcf92d5dc.zip | |
Merge tag 'for-6.16/io_uring-20250523' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:
- Avoid indirect function calls in io-wq for executing and freeing
work.
The design of io-wq is such that it can be a generic mechanism, but
as it's just used by io_uring now, may as well avoid these indirect
calls
- Clean up registered buffers for networking
- Add support for IORING_OP_PIPE. Pretty straightforward, allows
creating pipes with io_uring, particularly useful for having these be
instantiated as direct descriptors
- Clean up the coalescing support for registered buffers
- Add support for multiple interface queues for zero-copy rx
networking. When this feature was merged for 6.15, it supported just a
single ifq per ring
- Clean up the eventfd support
- Add dma-buf support to zero-copy rx
- Clean up and improve the request draining support
- Clean up provided buffer support, most notably with an eye toward
making the legacy support less intrusive
- Minor fdinfo cleanups, dropping support for dumping what credentials
are registered
- Improve support for overflow CQE handling, getting rid of GFP_ATOMIC
for allocating overflow entries where possible
- Improve detection of cases where io-wq doesn't need to spawn a new
worker
- Various little cleanups
* tag 'for-6.16/io_uring-20250523' of git://git.kernel.dk/linux: (59 commits)
io_uring/cmd: warn on reg buf imports by ineligible cmds
io_uring/io-wq: only create a new worker if it can make progress
io_uring/io-wq: ignore non-busy worker going to sleep
io_uring/io-wq: move hash helpers to the top
trace/io_uring: fix io_uring_local_work_run ctx documentation
io_uring: finish IOU_OK -> IOU_COMPLETE transition
io_uring: add new helpers for posting overflows
io_uring: pass in struct io_big_cqe to io_alloc_ocqe()
io_uring: make io_alloc_ocqe() take a struct io_cqe pointer
io_uring: split alloc and add of overflow
io_uring: open code io_req_cqe_overflow()
io_uring/fdinfo: get rid of dumping credentials
io_uring/fdinfo: only compile if CONFIG_PROC_FS is set
io_uring/kbuf: unify legacy buf provision and removal
io_uring/kbuf: refactor __io_remove_buffers
io_uring/kbuf: don't compute size twice on prep
io_uring/kbuf: drop extra vars in io_register_pbuf_ring
io_uring/kbuf: use mem_is_zero()
io_uring/kbuf: account ring io_buffer_list memory
io_uring: drain based on allocates reqs
...
Diffstat (limited to 'io_uring')
42 files changed, 944 insertions, 703 deletions
diff --git a/io_uring/Makefile b/io_uring/Makefile index 3e28a741ca15..d97c6b51d584 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -7,11 +7,11 @@ GCOV_PROFILE := y endif obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ - tctx.o filetable.o rw.o net.o poll.o \ + tctx.o filetable.o rw.o poll.o \ eventfd.o uring_cmd.o openclose.o \ sqpoll.o xattr.o nop.o fs.o splice.o \ sync.o msg_ring.o advise.o openclose.o \ - statx.o timeout.o fdinfo.o cancel.o \ + statx.o timeout.o cancel.o \ waitid.o register.o truncate.o \ memmap.o alloc_cache.o obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o @@ -19,3 +19,5 @@ obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_EPOLL) += epoll.o obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o +obj-$(CONFIG_NET) += net.o cmd_net.o +obj-$(CONFIG_PROC_FS) += fdinfo.o diff --git a/io_uring/advise.c b/io_uring/advise.c index cb7b881665e5..0073f74e3658 100644 --- a/io_uring/advise.c +++ b/io_uring/advise.c @@ -58,7 +58,7 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags) ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; #else return -EOPNOTSUPP; #endif @@ -104,5 +104,5 @@ int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 0870060bac7c..6d57602304df 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -229,7 +229,7 @@ done: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } static int __io_sync_cancel(struct io_uring_task *tctx, diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c new file mode 100644 index 000000000000..e99170c7d41a --- /dev/null +++ b/io_uring/cmd_net.c @@ -0,0 +1,83 @@ +#include <asm/ioctls.h> +#include <linux/io_uring/net.h> +#include <net/sock.h> + +#include "uring_cmd.h" + 
+static inline int io_uring_cmd_getsockopt(struct socket *sock, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + bool compat = !!(issue_flags & IO_URING_F_COMPAT); + int optlen, optname, level, err; + void __user *optval; + + level = READ_ONCE(sqe->level); + if (level != SOL_SOCKET) + return -EOPNOTSUPP; + + optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); + optname = READ_ONCE(sqe->optname); + optlen = READ_ONCE(sqe->optlen); + + err = do_sock_getsockopt(sock, compat, level, optname, + USER_SOCKPTR(optval), + KERNEL_SOCKPTR(&optlen)); + if (err) + return err; + + /* On success, return optlen */ + return optlen; +} + +static inline int io_uring_cmd_setsockopt(struct socket *sock, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + bool compat = !!(issue_flags & IO_URING_F_COMPAT); + int optname, optlen, level; + void __user *optval; + sockptr_t optval_s; + + optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); + optname = READ_ONCE(sqe->optname); + optlen = READ_ONCE(sqe->optlen); + level = READ_ONCE(sqe->level); + optval_s = USER_SOCKPTR(optval); + + return do_sock_setsockopt(sock, compat, level, optname, optval_s, + optlen); +} + +int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct socket *sock = cmd->file->private_data; + struct sock *sk = sock->sk; + struct proto *prot = READ_ONCE(sk->sk_prot); + int ret, arg = 0; + + if (!prot || !prot->ioctl) + return -EOPNOTSUPP; + + switch (cmd->cmd_op) { + case SOCKET_URING_OP_SIOCINQ: + ret = prot->ioctl(sk, SIOCINQ, &arg); + if (ret) + return ret; + return arg; + case SOCKET_URING_OP_SIOCOUTQ: + ret = prot->ioctl(sk, SIOCOUTQ, &arg); + if (ret) + return ret; + return arg; + case SOCKET_URING_OP_GETSOCKOPT: + return io_uring_cmd_getsockopt(sock, cmd, issue_flags); + case SOCKET_URING_OP_SETSOCKOPT: + return io_uring_cmd_setsockopt(sock, cmd, issue_flags); + default: + return 
-EOPNOTSUPP; + } +} +EXPORT_SYMBOL_GPL(io_uring_cmd_sock); diff --git a/io_uring/epoll.c b/io_uring/epoll.c index 6d2c48ba1923..8d4610246ba0 100644 --- a/io_uring/epoll.c +++ b/io_uring/epoll.c @@ -61,7 +61,7 @@ int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -88,5 +88,5 @@ int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c index 100d5da94cb9..78f8ab7db104 100644 --- a/io_uring/eventfd.c +++ b/io_uring/eventfd.c @@ -47,13 +47,6 @@ static void io_eventfd_do_signal(struct rcu_head *rcu) io_eventfd_put(ev_fd); } -static void io_eventfd_release(struct io_ev_fd *ev_fd, bool put_ref) -{ - if (put_ref) - io_eventfd_put(ev_fd); - rcu_read_unlock(); -} - /* * Returns true if the caller should put the ev_fd reference, false if not. */ @@ -72,63 +65,34 @@ static bool __io_eventfd_signal(struct io_ev_fd *ev_fd) /* * Trigger if eventfd_async isn't set, or if it's set and the caller is - * an async worker. If ev_fd isn't valid, obviously return false. + * an async worker. */ static bool io_eventfd_trigger(struct io_ev_fd *ev_fd) { - if (ev_fd) - return !ev_fd->eventfd_async || io_wq_current_is_worker(); - return false; + return !ev_fd->eventfd_async || io_wq_current_is_worker(); } -/* - * On success, returns with an ev_fd reference grabbed and the RCU read - * lock held. 
- */ -static struct io_ev_fd *io_eventfd_grab(struct io_ring_ctx *ctx) +void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event) { + bool skip = false; struct io_ev_fd *ev_fd; if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) - return NULL; - - rcu_read_lock(); + return; - /* - * rcu_dereference ctx->io_ev_fd once and use it for both for checking - * and eventfd_signal - */ + guard(rcu)(); ev_fd = rcu_dereference(ctx->io_ev_fd); - /* * Check again if ev_fd exists in case an io_eventfd_unregister call * completed between the NULL check of ctx->io_ev_fd at the start of * the function and rcu_read_lock. */ - if (io_eventfd_trigger(ev_fd) && refcount_inc_not_zero(&ev_fd->refs)) - return ev_fd; - - rcu_read_unlock(); - return NULL; -} - -void io_eventfd_signal(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - ev_fd = io_eventfd_grab(ctx); - if (ev_fd) - io_eventfd_release(ev_fd, __io_eventfd_signal(ev_fd)); -} - -void io_eventfd_flush_signal(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - ev_fd = io_eventfd_grab(ctx); - if (ev_fd) { - bool skip, put_ref = true; + if (!ev_fd) + return; + if (!io_eventfd_trigger(ev_fd) || !refcount_inc_not_zero(&ev_fd->refs)) + return; + if (cqe_event) { /* * Eventfd should only get triggered when at least one event * has been posted. 
Some applications rely on the eventfd @@ -142,12 +106,10 @@ void io_eventfd_flush_signal(struct io_ring_ctx *ctx) skip = ctx->cached_cq_tail == ev_fd->last_cq_tail; ev_fd->last_cq_tail = ctx->cached_cq_tail; spin_unlock(&ctx->completion_lock); - - if (!skip) - put_ref = __io_eventfd_signal(ev_fd); - - io_eventfd_release(ev_fd, put_ref); } + + if (skip || __io_eventfd_signal(ev_fd)) + io_eventfd_put(ev_fd); } int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, diff --git a/io_uring/eventfd.h b/io_uring/eventfd.h index d394f49c6321..e2f1985c2cf9 100644 --- a/io_uring/eventfd.h +++ b/io_uring/eventfd.h @@ -4,5 +4,4 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, unsigned int eventfd_async); int io_eventfd_unregister(struct io_ring_ctx *ctx); -void io_eventfd_flush_signal(struct io_ring_ctx *ctx); -void io_eventfd_signal(struct io_ring_ctx *ctx); +void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event); diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index e0d6a59a89fa..e9355276ab5d 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -15,37 +15,6 @@ #include "cancel.h" #include "rsrc.h" -#ifdef CONFIG_PROC_FS -static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, - const struct cred *cred) -{ - struct user_namespace *uns = seq_user_ns(m); - struct group_info *gi; - kernel_cap_t cap; - int g; - - seq_printf(m, "%5d\n", id); - seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid)); - seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid)); - seq_puts(m, 
"\n\tGroups:\t"); - gi = cred->group_info; - for (g = 0; g < gi->ngroups; g++) { - seq_put_decimal_ull(m, g ? " " : "", - from_kgid_munged(uns, gi->gid[g])); - } - seq_puts(m, "\n\tCapEff:\t"); - cap = cred->cap_effective; - seq_put_hex_ll(m, NULL, cap.val, 16); - seq_putc(m, '\n'); - return 0; -} - #ifdef CONFIG_NET_RX_BUSY_POLL static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m, @@ -214,14 +183,6 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) else seq_printf(m, "%5u: <none>\n", i); } - if (!xa_empty(&ctx->personalities)) { - unsigned long index; - const struct cred *cred; - - seq_printf(m, "Personalities:\n"); - xa_for_each(&ctx->personalities, index, cred) - io_uring_show_cred(m, index, cred); - } seq_puts(m, "PollList:\n"); for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) { @@ -264,4 +225,3 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) mutex_unlock(&ctx->uring_lock); } } -#endif diff --git a/io_uring/fs.c b/io_uring/fs.c index eccea851dd5a..37079a414eab 100644 --- a/io_uring/fs.c +++ b/io_uring/fs.c @@ -90,7 +90,7 @@ int io_renameat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_renameat_cleanup(struct io_kiocb *req) @@ -141,7 +141,7 @@ int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_unlinkat_cleanup(struct io_kiocb *req) @@ -185,7 +185,7 @@ int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_mkdirat_cleanup(struct io_kiocb *req) @@ -235,7 +235,7 @@ int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return 
IOU_OK; + return IOU_COMPLETE; } int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -281,7 +281,7 @@ int io_linkat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_link_cleanup(struct io_kiocb *req) diff --git a/io_uring/futex.c b/io_uring/futex.c index 0ea4820cd8ff..b34695022baa 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -234,7 +234,7 @@ int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags) kfree(futexv); req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; - return IOU_OK; + return IOU_COMPLETE; } /* @@ -311,7 +311,7 @@ done: req_set_fail(req); io_req_set_res(req, ret, 0); kfree(ifd); - return IOU_OK; + return IOU_COMPLETE; } int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags) @@ -328,5 +328,5 @@ int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 04a75d666195..cd1fcb115739 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -114,9 +114,6 @@ enum { struct io_wq { unsigned long state; - free_work_fn *free_work; - io_wq_work_fn *do_work; - struct io_wq_hash *hash; atomic_t worker_refs; @@ -153,6 +150,16 @@ static bool io_acct_cancel_pending_work(struct io_wq *wq, static void create_worker_cb(struct callback_head *cb); static void io_wq_cancel_tw_create(struct io_wq *wq); +static inline unsigned int __io_get_work_hash(unsigned int work_flags) +{ + return work_flags >> IO_WQ_HASH_SHIFT; +} + +static inline unsigned int io_get_work_hash(struct io_wq_work *work) +{ + return __io_get_work_hash(atomic_read(&work->flags)); +} + static bool io_worker_get(struct io_worker *worker) { return refcount_inc_not_zero(&worker->ref); @@ -412,6 +419,30 @@ fail: return false; } +/* Defer if current and next work are both hashed to 
the same chain */ +static bool io_wq_hash_defer(struct io_wq_work *work, struct io_wq_acct *acct) +{ + unsigned int hash, work_flags; + struct io_wq_work *next; + + lockdep_assert_held(&acct->lock); + + work_flags = atomic_read(&work->flags); + if (!__io_wq_is_hashed(work_flags)) + return false; + + /* should not happen, io_acct_run_queue() said we had work */ + if (wq_list_empty(&acct->work_list)) + return true; + + hash = __io_get_work_hash(work_flags); + next = container_of(acct->work_list.first, struct io_wq_work, list); + work_flags = atomic_read(&next->flags); + if (!__io_wq_is_hashed(work_flags)) + return false; + return hash == __io_get_work_hash(work_flags); +} + static void io_wq_dec_running(struct io_worker *worker) { struct io_wq_acct *acct = io_wq_get_acct(worker); @@ -422,8 +453,14 @@ static void io_wq_dec_running(struct io_worker *worker) if (!atomic_dec_and_test(&acct->nr_running)) return; + if (!worker->cur_work) + return; if (!io_acct_run_queue(acct)) return; + if (io_wq_hash_defer(worker->cur_work, acct)) { + raw_spin_unlock(&acct->lock); + return; + } raw_spin_unlock(&acct->lock); atomic_inc(&acct->nr_running); @@ -457,16 +494,6 @@ static void __io_worker_idle(struct io_wq_acct *acct, struct io_worker *worker) } } -static inline unsigned int __io_get_work_hash(unsigned int work_flags) -{ - return work_flags >> IO_WQ_HASH_SHIFT; -} - -static inline unsigned int io_get_work_hash(struct io_wq_work *work) -{ - return __io_get_work_hash(atomic_read(&work->flags)); -} - static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash) { bool ret = false; @@ -612,10 +639,10 @@ static void io_worker_handle_work(struct io_wq_acct *acct, if (do_kill && (work_flags & IO_WQ_WORK_UNBOUND)) atomic_or(IO_WQ_WORK_CANCEL, &work->flags); - wq->do_work(work); + io_wq_submit_work(work); io_assign_current_work(worker, NULL); - linked = wq->free_work(work); + linked = io_wq_free_work(work); work = next_hashed; if (!work && linked && !io_wq_is_hashed(linked)) { work = 
linked; @@ -934,8 +961,8 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq) { do { atomic_or(IO_WQ_WORK_CANCEL, &work->flags); - wq->do_work(work); - work = wq->free_work(work); + io_wq_submit_work(work); + work = io_wq_free_work(work); } while (work); } @@ -1195,8 +1222,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) int ret, i; struct io_wq *wq; - if (WARN_ON_ONCE(!data->free_work || !data->do_work)) - return ERR_PTR(-EINVAL); if (WARN_ON_ONCE(!bounded)) return ERR_PTR(-EINVAL); @@ -1206,8 +1231,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) refcount_inc(&data->hash->refs); wq->hash = data->hash; - wq->free_work = data->free_work; - wq->do_work = data->do_work; ret = -ENOMEM; diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index d4fb2940e435..774abab54732 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -21,9 +21,6 @@ enum io_wq_cancel { IO_WQ_CANCEL_NOTFOUND, /* work not found */ }; -typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *); -typedef void (io_wq_work_fn)(struct io_wq_work *); - struct io_wq_hash { refcount_t refs; unsigned long map; @@ -39,8 +36,6 @@ static inline void io_wq_put_hash(struct io_wq_hash *hash) struct io_wq_data { struct io_wq_hash *hash; struct task_struct *task; - io_wq_work_fn *do_work; - free_work_fn *free_work; }; struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 46373549a733..c7a9cecf528e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -129,7 +129,6 @@ struct io_defer_entry { struct list_head list; struct io_kiocb *req; - u32 seq; }; /* requests with any of those set should undergo io_disarm_next() */ @@ -149,6 +148,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, bool is_sqpoll_thread); static void io_queue_sqe(struct io_kiocb *req); +static void __io_req_caches_free(struct io_ring_ctx *ctx); static __read_mostly 
DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray); @@ -359,6 +359,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->tctx_list); ctx->submit_state.free_list.next = NULL; INIT_HLIST_HEAD(&ctx->waitid_list); + xa_init_flags(&ctx->zcrx_ctxs, XA_FLAGS_ALLOC); #ifdef CONFIG_FUTEX INIT_HLIST_HEAD(&ctx->futex_list); #endif @@ -380,25 +381,6 @@ err: return NULL; } -static void io_account_cq_overflow(struct io_ring_ctx *ctx) -{ - struct io_rings *r = ctx->rings; - - WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); - ctx->cq_extra--; -} - -static bool req_need_defer(struct io_kiocb *req, u32 seq) -{ - if (unlikely(req->flags & REQ_F_IO_DRAIN)) { - struct io_ring_ctx *ctx = req->ctx; - - return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; - } - - return false; -} - static void io_clean_op(struct io_kiocb *req) { if (unlikely(req->flags & REQ_F_BUFFER_SELECTED)) @@ -537,20 +519,37 @@ void io_req_queue_iowq(struct io_kiocb *req) io_req_task_work_add(req); } +static unsigned io_linked_nr(struct io_kiocb *req) +{ + struct io_kiocb *tmp; + unsigned nr = 0; + + io_for_each_link(tmp, req) + nr++; + return nr; +} + static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) { - spin_lock(&ctx->completion_lock); + bool drain_seen = false, first = true; + + lockdep_assert_held(&ctx->uring_lock); + __io_req_caches_free(ctx); + while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); - if (req_need_defer(de->req, de->seq)) - break; + drain_seen |= de->req->flags & REQ_F_IO_DRAIN; + if ((drain_seen || first) && ctx->nr_req_allocated != ctx->nr_drained) + return; + list_del_init(&de->list); + ctx->nr_drained -= io_linked_nr(de->req); io_req_task_queue(de->req); kfree(de); + first = false; } - spin_unlock(&ctx->completion_lock); } void __io_commit_cqring_flush(struct io_ring_ctx *ctx) @@ -559,10 +558,8 @@ void 
__io_commit_cqring_flush(struct io_ring_ctx *ctx) io_poll_wq_wake(ctx); if (ctx->off_timeout_used) io_flush_timeouts(ctx); - if (ctx->drain_active) - io_queue_deferred(ctx);< |
