diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-07-15 13:49:10 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-07-15 13:49:10 -0700 |
| commit | 3a56e241732975c2c1247047ddbfc0ac6f6a4905 (patch) | |
| tree | c365b9a90548e6f60e2773a486402c6384ea3ac1 /io_uring | |
| parent | 4f5e249ec0ea8872e1644df23cffffbe28007188 (diff) | |
| parent | ad00e629145b2b9f0d78aa46e204a9df7d628978 (diff) | |
| download | linux-3a56e241732975c2c1247047ddbfc0ac6f6a4905.tar.gz linux-3a56e241732975c2c1247047ddbfc0ac6f6a4905.tar.bz2 linux-3a56e241732975c2c1247047ddbfc0ac6f6a4905.zip | |
Merge tag 'for-6.11/io_uring-20240714' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:
"Here are the io_uring updates queued up for 6.11.
Nothing major this time around, various minor improvements and
cleanups/fixes. This contains:
- Add bind/listen opcodes. Main motivation is to support direct
descriptors, to avoid needing a regular fd just for doing these two
operations (Gabriel)
- Probe fixes (Gabriel)
- Treat io-wq work flags as atomics. Not fixing a real issue, but may
as well and it silences a KCSAN warning (me)
- Cleanup of rsrc __set_current_state() usage (me)
- Add 64-bit for {m,f}advise operations (me)
- Improve performance of data ring messages (me)
- Fix for ring message overflow posting (Pavel)
- Fix for freezer interaction with TWA_NOTIFY_SIGNAL. Not strictly an
io_uring thing, but since TWA_NOTIFY_SIGNAL was originally added
for faster task_work signaling for io_uring, bundling it with this
pull (Pavel)
- Add Pavel as a co-maintainer
- Various cleanups (me, Thorsten)"
* tag 'for-6.11/io_uring-20240714' of git://git.kernel.dk/linux: (28 commits)
io_uring/net: check socket is valid in io_bind()/io_listen()
kernel: rerun task_work while freezing in get_signal()
io_uring/io-wq: limit retrying worker initialisation
io_uring/napi: Remove unnecessary s64 cast
io_uring/net: cleanup io_recv_finish() bundle handling
io_uring/msg_ring: fix overflow posting
MAINTAINERS: change Pavel Begunkov from io_uring reviewer to maintainer
io_uring/msg_ring: use kmem_cache_free() to free request
io_uring/msg_ring: check for dead submitter task
io_uring/msg_ring: add an alloc cache for io_kiocb entries
io_uring/msg_ring: improve handling of target CQE posting
io_uring: add io_add_aux_cqe() helper
io_uring: add remote task_work execution helper
io_uring/msg_ring: tighten requirement for remote posting
io_uring: Allocate only necessary memory in io_probe
io_uring: Fix probe of disabled operations
io_uring: Introduce IORING_OP_LISTEN
io_uring: Introduce IORING_OP_BIND
net: Split a __sys_listen helper for io_uring
net: Split a __sys_bind helper for io_uring
...
Diffstat (limited to 'io_uring')
| -rw-r--r-- | io_uring/Makefile | 6 | ||||
| -rw-r--r-- | io_uring/advise.c | 16 | ||||
| -rw-r--r-- | io_uring/eventfd.c | 160 | ||||
| -rw-r--r-- | io_uring/eventfd.h | 8 | ||||
| -rw-r--r-- | io_uring/io-wq.c | 29 | ||||
| -rw-r--r-- | io_uring/io-wq.h | 2 | ||||
| -rw-r--r-- | io_uring/io_uring.c | 150 | ||||
| -rw-r--r-- | io_uring/io_uring.h | 9 | ||||
| -rw-r--r-- | io_uring/msg_ring.c | 122 | ||||
| -rw-r--r-- | io_uring/msg_ring.h | 1 | ||||
| -rw-r--r-- | io_uring/napi.c | 2 | ||||
| -rw-r--r-- | io_uring/net.c | 94 | ||||
| -rw-r--r-- | io_uring/net.h | 6 | ||||
| -rw-r--r-- | io_uring/opdef.c | 34 | ||||
| -rw-r--r-- | io_uring/opdef.h | 4 | ||||
| -rw-r--r-- | io_uring/register.c | 65 | ||||
| -rw-r--r-- | io_uring/rsrc.c | 63 |
17 files changed, 490 insertions, 281 deletions
diff --git a/io_uring/Makefile b/io_uring/Makefile index fc1b23c524e8..61923e11c767 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -4,9 +4,9 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ tctx.o filetable.o rw.o net.o poll.o \ - uring_cmd.o openclose.o sqpoll.o \ - xattr.o nop.o fs.o splice.o sync.o \ - msg_ring.o advise.o openclose.o \ + eventfd.o uring_cmd.o openclose.o \ + sqpoll.o xattr.o nop.o fs.o splice.o \ + sync.o msg_ring.o advise.o openclose.o \ epoll.o statx.o timeout.o fdinfo.o \ cancel.o waitid.o register.o \ truncate.o memmap.o diff --git a/io_uring/advise.c b/io_uring/advise.c index 7085804c513c..cb7b881665e5 100644 --- a/io_uring/advise.c +++ b/io_uring/advise.c @@ -17,14 +17,14 @@ struct io_fadvise { struct file *file; u64 offset; - u32 len; + u64 len; u32 advice; }; struct io_madvise { struct file *file; u64 addr; - u32 len; + u64 len; u32 advice; }; @@ -33,11 +33,13 @@ int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise); - if (sqe->buf_index || sqe->off || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; ma->addr = READ_ONCE(sqe->addr); - ma->len = READ_ONCE(sqe->len); + ma->len = READ_ONCE(sqe->off); + if (!ma->len) + ma->len = READ_ONCE(sqe->len); ma->advice = READ_ONCE(sqe->fadvise_advice); req->flags |= REQ_F_FORCE_ASYNC; return 0; @@ -78,11 +80,13 @@ int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise); - if (sqe->buf_index || sqe->addr || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; fa->offset = READ_ONCE(sqe->off); - fa->len = READ_ONCE(sqe->len); + fa->len = READ_ONCE(sqe->addr); + if (!fa->len) + fa->len = READ_ONCE(sqe->len); fa->advice = READ_ONCE(sqe->fadvise_advice); if (io_fadvise_force_async(fa)) req->flags |= REQ_F_FORCE_ASYNC; diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c new file mode 100644 index 000000000000..b9384503a2b7 --- /dev/null +++ b/io_uring/eventfd.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/eventfd.h> +#include <linux/eventpoll.h> +#include <linux/io_uring.h> +#include <linux/io_uring_types.h> + +#include "io-wq.h" +#include "eventfd.h" + +struct io_ev_fd { + struct eventfd_ctx *cq_ev_fd; + unsigned int eventfd_async: 1; + struct rcu_head rcu; + atomic_t refs; + atomic_t ops; +}; + +enum { + IO_EVENTFD_OP_SIGNAL_BIT, +}; + +static void io_eventfd_free(struct rcu_head *rcu) +{ + struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); + + eventfd_ctx_put(ev_fd->cq_ev_fd); + kfree(ev_fd); +} + +static void io_eventfd_do_signal(struct rcu_head *rcu) +{ + struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); + + eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); + + if (atomic_dec_and_test(&ev_fd->refs)) + io_eventfd_free(rcu); +} + +void io_eventfd_signal(struct io_ring_ctx *ctx) +{ + struct io_ev_fd *ev_fd = NULL; + + if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) + return; + + guard(rcu)(); + + /* + * rcu_dereference ctx->io_ev_fd once and use it for both for checking + * and eventfd_signal + */ + ev_fd = rcu_dereference(ctx->io_ev_fd); + + /* + * Check again if ev_fd exists incase an io_eventfd_unregister call + * completed between the NULL check of ctx->io_ev_fd at the start of + * the function and rcu_read_lock. + */ + if (unlikely(!ev_fd)) + return; + if (!atomic_inc_not_zero(&ev_fd->refs)) + return; + if (ev_fd->eventfd_async && !io_wq_current_is_worker()) + goto out; + + if (likely(eventfd_signal_allowed())) { + eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); + } else { + if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) { + call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal); + return; + } + } +out: + if (atomic_dec_and_test(&ev_fd->refs)) + call_rcu(&ev_fd->rcu, io_eventfd_free); +} + +void io_eventfd_flush_signal(struct io_ring_ctx *ctx) +{ + bool skip; + + spin_lock(&ctx->completion_lock); + + /* + * Eventfd should only get triggered when at least one event has been + * posted. Some applications rely on the eventfd notification count + * only changing IFF a new CQE has been added to the CQ ring. There's + * no depedency on 1:1 relationship between how many times this + * function is called (and hence the eventfd count) and number of CQEs + * posted to the CQ ring. + */ + skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail; + ctx->evfd_last_cq_tail = ctx->cached_cq_tail; + spin_unlock(&ctx->completion_lock); + if (skip) + return; + + io_eventfd_signal(ctx); +} + +int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned int eventfd_async) +{ + struct io_ev_fd *ev_fd; + __s32 __user *fds = arg; + int fd; + + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, + lockdep_is_held(&ctx->uring_lock)); + if (ev_fd) + return -EBUSY; + + if (copy_from_user(&fd, fds, sizeof(*fds))) + return -EFAULT; + + ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); + if (!ev_fd) + return -ENOMEM; + + ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); + if (IS_ERR(ev_fd->cq_ev_fd)) { + int ret = PTR_ERR(ev_fd->cq_ev_fd); + kfree(ev_fd); + return ret; + } + + spin_lock(&ctx->completion_lock); + ctx->evfd_last_cq_tail = ctx->cached_cq_tail; + spin_unlock(&ctx->completion_lock); + + ev_fd->eventfd_async = eventfd_async; + ctx->has_evfd = true; + atomic_set(&ev_fd->refs, 1); + atomic_set(&ev_fd->ops, 0); + rcu_assign_pointer(ctx->io_ev_fd, ev_fd); + return 0; +} + +int io_eventfd_unregister(struct io_ring_ctx *ctx) +{ + struct io_ev_fd *ev_fd; + + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, + lockdep_is_held(&ctx->uring_lock)); + if (ev_fd) { + ctx->has_evfd = false; + rcu_assign_pointer(ctx->io_ev_fd, NULL); + if (atomic_dec_and_test(&ev_fd->refs)) + call_rcu(&ev_fd->rcu, io_eventfd_free); + return 0; + } + + return -ENXIO; +} diff --git a/io_uring/eventfd.h b/io_uring/eventfd.h new file mode 100644 index 000000000000..d394f49c6321 --- /dev/null +++ b/io_uring/eventfd.h @@ -0,0 +1,8 @@ + +struct io_ring_ctx; +int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned int eventfd_async); +int io_eventfd_unregister(struct io_ring_ctx *ctx); + +void io_eventfd_flush_signal(struct io_ring_ctx *ctx); +void io_eventfd_signal(struct io_ring_ctx *ctx); diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 7d3316fe9bfc..f1e7c670add8 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -23,6 +23,7 @@ #include "io_uring.h" #define WORKER_IDLE_TIMEOUT (5 * HZ) +#define WORKER_INIT_LIMIT 3 enum { IO_WORKER_F_UP = 0, /* up and active */ @@ -58,6 +59,7 @@ struct io_worker { unsigned long create_state; struct callback_head create_work; + int init_retries; union { struct rcu_head rcu; @@ -159,7 +161,7 @@ static inline struct io_wq_acct *io_get_acct(struct io_wq *wq, bool bound) static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq, struct io_wq_work *work) { - return io_get_acct(wq, !(work->flags & IO_WQ_WORK_UNBOUND)); + return io_get_acct(wq, !(atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)); } static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker) @@ -451,7 +453,7 @@ static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker) static inline unsigned int io_get_work_hash(struct io_wq_work *work) { - return work->flags >> IO_WQ_HASH_SHIFT; + return atomic_read(&work->flags) >> IO_WQ_HASH_SHIFT; } static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash) @@ -592,8 +594,9 @@ static void io_worker_handle_work(struct io_wq_acct *acct, next_hashed = wq_next_work(work); - if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND)) - work->flags |= IO_WQ_WORK_CANCEL; + if (do_kill && + (atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)) + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); wq->do_work(work); io_assign_current_work(worker, NULL); @@ -744,7 +747,7 @@ static bool io_wq_work_match_all(struct io_wq_work *work, void *data) return true; } -static inline bool io_should_retry_thread(long err) +static inline bool io_should_retry_thread(struct io_worker *worker, long err) { /* * Prevent perpetual task_work retry, if the task (or its group) is @@ -752,6 +755,8 @@ static inline bool io_should_retry_thread(long err) */ if (fatal_signal_pending(current)) return false; + if (worker->init_retries++ >= WORKER_INIT_LIMIT) + return false; switch (err) { case -EAGAIN: @@ -778,7 +783,7 @@ static void create_worker_cont(struct callback_head *cb) io_init_new_worker(wq, worker, tsk); io_worker_release(worker); return; - } else if (!io_should_retry_thread(PTR_ERR(tsk))) { + } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { struct io_wq_acct *acct = io_wq_get_acct(worker); atomic_dec(&acct->nr_running); @@ -845,7 +850,7 @@ fail: tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { io_init_new_worker(wq, worker, tsk); - } else if (!io_should_retry_thread(PTR_ERR(tsk))) { + } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { kfree(worker); goto fail; } else { @@ -891,7 +896,7 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data) static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq) { do { - work->flags |= IO_WQ_WORK_CANCEL; + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); wq->do_work(work); work = wq->free_work(work); } while (work); @@ -926,7 +931,7 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data) void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) { struct io_wq_acct *acct = io_work_get_acct(wq, work); - unsigned long work_flags = work->flags; + unsigned int work_flags = atomic_read(&work->flags); struct io_cb_cancel_data match = { .fn = io_wq_work_match_item, .data = work, @@ -939,7 +944,7 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) * been marked as one that should not get executed, cancel it here. */ if (test_bit(IO_WQ_BIT_EXIT, &wq->state) || - (work->flags & IO_WQ_WORK_CANCEL)) { + (work_flags & IO_WQ_WORK_CANCEL)) { io_run_cancel(work, wq); return; } @@ -982,7 +987,7 @@ void io_wq_hash_work(struct io_wq_work *work, void *val) unsigned int bit; bit = hash_ptr(val, IO_WQ_HASH_ORDER); - work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); + atomic_or(IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT), &work->flags); } static bool __io_wq_worker_cancel(struct io_worker *worker, @@ -990,7 +995,7 @@ static bool __io_wq_worker_cancel(struct io_worker *worker, struct io_wq_work *work) { if (work && match->fn(work, match->data)) { - work->flags |= IO_WQ_WORK_CANCEL; + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); __set_notify_signal(worker->task); return true; } diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index 2b2a6406dd8e..b3b004a7b625 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -56,7 +56,7 @@ bool io_wq_worker_stopped(void); static inline bool io_wq_is_hashed(struct io_wq_work *work) { - return work->flags & IO_WQ_WORK_HASHED; + return atomic_read(&work->flags) & IO_WQ_WORK_HASHED; } typedef bool (work_cancel_fn)(struct io_wq_work *, void *); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c326e2127dd4..8e6faa942a6f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -95,12 +95,14 @@ #include "futex.h" #include "napi.h" #include "uring_cmd.h" +#include "msg_ring.h" #include "memmap.h" #include "timeout.h" #include "poll.h" #include "rw.h" #include "alloc_cache.h" +#include "eventfd.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -314,6 +316,9 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) sizeof(struct io_async_rw)); ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, sizeof(struct uring_cache)); + spin_lock_init(&ctx->msg_lock); + ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct io_kiocb)); ret |= io_futex_cache_init(ctx); if (ret) goto err; @@ -350,6 +355,7 @@ err: io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_alloc_cache_free(&ctx->uring_cache, kfree); + io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); io_futex_cache_free(ctx); kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); @@ -461,9 +467,9 @@ static void io_prep_async_work(struct io_kiocb *req) } req->work.list.next = NULL; - req->work.flags = 0; + atomic_set(&req->work.flags, 0); if (req->flags & REQ_F_FORCE_ASYNC) - req->work.flags |= IO_WQ_WORK_CONCURRENT; + atomic_or(IO_WQ_WORK_CONCURRENT, &req->work.flags); if (req->file && !(req->flags & REQ_F_FIXED_FILE)) req->flags |= io_file_get_flags(req->file); @@ -479,7 +485,7 @@ static void io_prep_async_work(struct io_kiocb *req) io_wq_hash_work(&req->work, file_inode(req->file)); } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { if (def->unbound_nonreg_file) - req->work.flags |= IO_WQ_WORK_UNBOUND; + atomic_or(IO_WQ_WORK_UNBOUND, &req->work.flags); } } @@ -519,7 +525,7 @@ static void io_queue_iowq(struct io_kiocb *req) * worker for it). */ if (WARN_ON_ONCE(!same_thread_group(req->task, current))) - req->work.flags |= IO_WQ_WORK_CANCEL; + atomic_or(IO_WQ_WORK_CANCEL, &req->work.flags); trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); @@ -541,84 +547,6 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx) } } -void io_eventfd_ops(struct rcu_head *rcu) -{ - struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); - int ops = atomic_xchg(&ev_fd->ops, 0); - - if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT)) - eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); - - /* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback - * ordering in a race but if references are 0 we know we have to free - * it regardless. - */ - if (atomic_dec_and_test(&ev_fd->refs)) { - eventfd_ctx_put(ev_fd->cq_ev_fd); - kfree(ev_fd); - } -} - -static void io_eventfd_signal(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd = NULL; - - rcu_read_lock(); - /* - * rcu_dereference ctx->io_ev_fd once and use it for both for checking - * and eventfd_signal - */ - ev_fd = rcu_dereference(ctx->io_ev_fd); - - /* - * Check again if ev_fd exists incase an io_eventfd_unregister call - * completed between the NULL check of ctx->io_ev_fd at the start of - * the function and rcu_read_lock. - */ - if (unlikely(!ev_fd)) - goto out; - if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) - goto out; - if (ev_fd->eventfd_async && !io_wq_current_is_worker()) - goto out; - - if (likely(eventfd_signal_allowed())) { - eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); - } else { - atomic_inc(&ev_fd->refs); - if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) - call_rcu_hurry(&ev_fd->rcu, io_eventfd_ops); - else - atomic_dec(&ev_fd->refs); - } - -out: - rcu_read_unlock(); -} - -static void io_eventfd_flush_signal(struct io_ring_ctx *ctx) -{ - bool skip; - - spin_lock(&ctx->completion_lock); - - /* - * Eventfd should only get triggered when at least one event has been - * posted. Some applications rely on the eventfd notification count - * only changing IFF a new CQE has been added to the CQ ring. There's - * no depedency on 1:1 relationship between how many times this - * function is called (and hence the eventfd count) and number of CQEs - * posted to the CQ ring. - */ - skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail; - ctx->evfd_last_cq_tail = ctx->cached_cq_tail; - spin_unlock(&ctx->completion_lock); - if (skip) - return; - - io_eventfd_signal(ctx); -} - void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (ctx->poll_activated) @@ -878,20 +806,43 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, return false; } -bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) +static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, + u32 cflags) { bool filled; - io_cq_lock(ctx); filled = io_fill_cqe_aux(ctx, user_data, res, cflags); if (!filled) filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + return filled; +} + +bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) +{ + bool filled; + + io_cq_lock(ctx); + filled = __io_post_aux_cqe(ctx, user_data, res, cflags); io_cq_unlock_post(ctx); return filled; } /* + * Must be called from inline task_work so we now a flush will happen later, + * and obviously with ctx->uring_lock held (tw always has that). + */ +void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) +{ + if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { + spin_lock(&ctx->completion_lock); + io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + spin_unlock(&ctx->completion_lock); + } + ctx->submit_state.cq_flush = true; +} + +/* * A helper for multishot requests posting additional CQEs. * Should only be used from a task_work including IO_URING_F_MULTISHOT. */ @@ -1175,9 +1126,10 @@ void tctx_task_work(struct callback_head *cb) WARN_ON_ONCE(ret); } -static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) +static inline void io_req_local_work_add(struct io_kiocb *req, + struct io_ring_ctx *ctx, + unsigned flags) { - struct io_ring_ctx *ctx = req->ctx; unsigned nr_wait, nr_tw, nr_tw_prev; struct llist_node *head; @@ -1191,6 +1143,8 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) flags &= ~IOU_F_TWQ_LAZY_WAKE; + guard(rcu)(); + head = READ_ONCE(ctx->work_llist.first); do { nr_tw_prev = 0; @@ -1272,13 +1226,18 @@ static void io_req_normal_work_add(struct io_kiocb *req) void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) { - if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - rcu_read_lock(); - io_req_local_work_add(req, flags); - rcu_read_unlock(); - } else { + if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) + io_req_local_work_add(req, req->ctx, flags); + else io_req_normal_work_add(req); - } +} + +void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, + unsigned flags) +{ + if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))) + return; + io_req_local_work_add(req, ctx, flags); } static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) @@ -1467,7 +1426,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) } __io_cq_unlock_post(ctx); - if (!wq_list_empty(&ctx->submit_state.compl_reqs)) { + if (!wq_list_empty(&state->compl_reqs)) { io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); } @@ -1813,14 +1772,14 @@ void io_wq_submit_work(struct io_wq_work *work) io_arm_ltimeout(req); /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ - if (work->flags & IO_WQ_WORK_CANCEL) { + if (atomic_read(&work->flags) & IO_WQ_WORK_CANCEL) { fail: io_req_task_queue_fail(req, err); return; } if (!io_assign_file(req, def, issue_flags)) { err = -EBADF; - work->flags |= IO_WQ_WORK_CANCEL; + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); goto fail; } @@ -2649,6 +2608,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_alloc_cache_free(&ctx->uring_cache, kfree); + io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); io_futex_cache_free(ctx); io_destroy_buffers(ctx); mutex_unlock(&ctx->uring_lock); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 726e6367af4d..e1ce908f0679 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -65,6 +65,7 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); +void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); @@ -73,6 +74,8 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); +void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, + unsigned flags); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); @@ -104,12 +107,6 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx); bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all); -enum { - IO_EVENTFD_OP_SIGNAL_BIT, - IO_EVENTFD_OP_FREE_BIT, -}; - -void io_eventfd_ops(struct rcu_head *rcu); void io_activate_pollwq(struct io_ring_ctx *ctx); static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 81c4a9d43729..29fa9285a33d 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -11,9 +11,9 @@ #include "io_uring.h" #include "rsrc.h" #include "filetable.h" +#include "alloc_cache.h" #include "msg_ring.h" - /* All valid masks for MSG_RING */ #define IORING_MSG_RING_MASK (IORING_MSG_RING_CQE_SKIP | \ IORING_MSG_RING_FLAGS_PASS) @@ -68,59 +68,70 @@ void io_msg_ring_cleanup(struct io_kiocb *req) static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx) { - if (!target_ctx->task_complete) - return false; - return current != target_ctx->submitter_task; + return target_ctx->task_complete; } -static int io_msg_exec_remote(struct io_kiocb *req, task_work_func_t func) +static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts) { - struct io_ring_ctx *ctx = req->file->private_data; - struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); - struct task_struct *task = READ_ONCE(ctx->submitter_task); + struct io_ring_ctx *ctx = req->ctx; - if (unlikely(!task)) - return -EOWNERDEAD; + io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags); + if (spin_trylock(&ctx->msg_lock)) { + if (io_alloc_cache_put(&ctx->msg_cache, req)) + req = NULL; + spin_unlock(&ctx->msg_lock); + } + if (req) + kmem_cache_free(req_cachep, req); + percpu_ref_put(&ctx->refs); +} - init_task_work(&msg->tw, func); - if (task_work_add(task, &msg->tw, TWA_SIGNAL)) +static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, + int res, u32 cflags, u64 user_data) +{ + req->task = READ_ONCE(ctx->submitter_task); + if (!req->task) { + kmem_cache_free(req_cachep, req); return -EOWNERDEAD; + } + req->cqe.user_data = user_data; + io_req_set_res(req, res, cflags); + percpu_ref_get(&ctx->refs); + req->ctx = ctx; + req->io_task_work.func = io_msg_tw_complete; + io_req_task_work_add_remote(req, ctx, IOU_F_TWQ_LAZY_WAKE); + return 0; +} - return IOU_ISSUE_SKIP_COMPLETE; +static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req = NULL; + + if (spin_trylock(&ctx->msg_lock)) { + req = io_alloc_cache_get(&ctx->msg_cache); + spin_unlock(&ctx->msg_lock); + } + if (req) + return req; + return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN); } -static void io_msg_tw_complete(struct callback_head *head) +static int io_msg_data_remote(struct io_kiocb *req) { - struct io_msg *msg = container_of(head, struct io_msg, tw); - struct io_kiocb *req = cmd_to_io_kiocb(msg); struct io_ring_ctx *target_ctx = req->file->private_data; - int ret = 0; - - if (current->flags & PF_EXITING) { - ret = -EOWNERDEAD; - } else { - u32 flags = 0; - - if (msg->flags & IORING_MSG_RING_FLAGS_PASS) - flags = msg->cqe_flags; - - /* - * If the target ring is using IOPOLL mode, then we need to be - * holding the uring_lock for posting completions. Other ring - * types rely on the regular completion locking, which is - * handled while posting. - */ - if (target_ctx->flags & IORING_SETUP_IOPOLL) - mutex_lock(&target_ctx->uring_lock); - if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) - ret = -EOVERFLOW; - if (target_ctx->flags & IORING_SETUP_IOPOLL) - mutex_unlock(&target_ctx->uring_lock); - } + struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); + struct io_kiocb *target; + u32 flags = 0; - if (ret < 0) - req_set_fail(req); - io_req_queue_tw_complete(req, ret); + target = io_msg_get_kiocb(req->ctx); + if (unlikely(!target)) + return -ENOMEM; + + if (msg->flags & IORING_MSG_RING_FLAGS_PASS) + flags = msg->cqe_flags; + + return io_msg_remote_post(target_ctx, target, msg->len, flags, + msg->user_data); } static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) @@ -138,7 +149,7 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) return -EBADFD; if (io_msg_need_remote(target_ctx)) - return io_msg_exec_remote(req, io_msg_tw_complete); + return io_msg_data_remote(req); if (msg->flags & IORING_MSG_RING_FLAGS_PASS) flags = msg->cqe_flags; @@ -218,6 +229,22 @@ static void io_msg_tw_fd_complete(struct callback_head *head) io_req_queue_tw_complete(req, ret); } +static int io_msg_fd_remote(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->file->private_data; + struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); + struct task_struct *task = READ_ONCE(ctx->submitter_task); + + if (unlikely(!task)) + return -EOWNERDEAD; + + init_task_work(&msg->tw, io_msg_tw_fd_complete); + if (task_work_add(task, &msg->tw, TWA_SIGNAL)) + return -EOWNERDEAD; + + return IOU_ISSUE_SKIP_COMPLETE; +} + static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) |
