| field | value | date |
|---|---|---|
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-02-20 13:53:39 -0800 |
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-02-20 13:53:39 -0800 |
| commit | cce5fe5eda0581363a9c585dabf8a5923f15a708 | |
| tree | 1de4a7d3b67de40b68ae1764d3ed7631783e5bb9 | |
| parent | eca3a04f140a7380d8a7b4cd89d681706a69380c | |
| parent | 7d3fd88d61a41016da01889f076fd1c60c7298fc | |
Merge tag 'for-6.3/io_uring-2023-02-16' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:
- Cleanup series making the async prep and handling of
REQ_F_FORCE_ASYNC easier to follow and verify (Dylan)
- Enable specifying specific flags for OP_MSG_RING (Breno)
- Enable use of KASAN with the internal request cache (Breno)
- Split the opcode definition structs into a hot and cold part (Breno)
- OP_MSG_RING fixes (Pavel, me)
- Fix an issue with IOPOLL cancelation and PREEMPT_NONE (me)
- Handle TIF_NOTIFY_RESUME for the io-wq threads that never return to
userspace (me)
- Add support for using io_uring_register() with a registered ring fd
(Josh)
- Improve handling of poll on the ring fd (Pavel)
- Series improving the task_work handling (Pavel)
- Misc cleanups, fixes, improvements (Dmitrii, Quanfa, Richard, Pavel,
me)
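
The registered-ring-fd support from Josh Triplett's series surfaces in the uapi hunks further down as the IORING_FEAT_REG_REG_RING feature bit and the IORING_REGISTER_USE_REGISTERED_RING opcode flag. A minimal sketch of how userspace might use it — the helper name is invented, the ring fd is assumed to have been registered at `ring_index` via IORING_REGISTER_RING_FDS beforehand, and error handling is elided:

```c
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>
#include <linux/io_uring.h>

/* Hypothetical helper: 'ring_index' is the offset at which the ring fd
 * was previously registered with IORING_REGISTER_RING_FDS. */
static int register_buffers_via_ring_index(unsigned int ring_index,
					   const struct iovec *iovs,
					   unsigned int nr_iovs)
{
	/* With IORING_REGISTER_USE_REGISTERED_RING or'ed into the opcode,
	 * the first argument is interpreted as a registered ring index
	 * rather than a file descriptor. Support is advertised via the
	 * IORING_FEAT_REG_REG_RING bit in io_uring_params.features. */
	return syscall(__NR_io_uring_register, ring_index,
		       IORING_REGISTER_BUFFERS | IORING_REGISTER_USE_REGISTERED_RING,
		       iovs, nr_iovs);
}
```

This lets a process that has registered its ring fd (and possibly closed the real fd) still drive io_uring_register(2) without keeping a file descriptor open.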
* tag 'for-6.3/io_uring-2023-02-16' of git://git.kernel.dk/linux: (51 commits)
io_uring: Support calling io_uring_register with a registered ring fd
io_uring,audit: don't log IORING_OP_MADVISE
io_uring: mark task TASK_RUNNING before handling resume/task work
io_uring: always go async for unsupported open flags
io_uring: always go async for unsupported fadvise flags
io_uring: for requests that require async, force it
io_uring: if a linked request has REQ_F_FORCE_ASYNC then run it async
io_uring: add reschedule point to handle_tw_list()
io_uring: add a conditional reschedule to the IOPOLL cancelation loop
io_uring: return normal tw run linking optimisation
io_uring: refactor tctx_task_work
io_uring: refactor io_put_task helpers
io_uring: refactor req allocation
io_uring: improve io_get_sqe
io_uring: kill outdated comment about overflow flush
io_uring: use user visible tail in io_uring_poll()
io_uring: pass in io_issue_def to io_assign_file()
io_uring: Enable KASAN for request cache
io_uring: handle TIF_NOTIFY_RESUME when checking for task_work
io_uring/msg-ring: ensure flags passing works for task_work completions
...
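
One of the MSG_RING changes above is the new IORING_MSG_RING_FLAGS_PASS flag (see the uapi hunk below), which routes sqe->file_index through to cqe->flags on the target ring. A hedged sketch of preparing such an SQE by hand — the helper is illustrative, not a liburing API:

```c
#include <string.h>
#include <linux/io_uring.h>

/* Illustrative only: field routing follows the msg_ring uapi
 * (cqe->user_data from sqe->off, cqe->res from sqe->len, and, with
 * IORING_MSG_RING_FLAGS_PASS, cqe->flags from sqe->file_index). */
static void prep_msg_ring_flags_pass(struct io_uring_sqe *sqe,
				     int target_ring_fd, __u64 user_data,
				     __u32 res, __u32 cqe_flags)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_MSG_RING;
	sqe->fd = target_ring_fd;	/* ring that receives the CQE */
	sqe->off = user_data;		/* becomes cqe->user_data there */
	sqe->len = res;			/* becomes cqe->res there */
	sqe->msg_ring_flags = IORING_MSG_RING_FLAGS_PASS;
	sqe->file_index = cqe_flags;	/* passed through to cqe->flags */
}
```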
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | include/linux/io_uring_types.h | 21 |
| -rw-r--r-- | include/uapi/linux/io_uring.h | 8 |
| -rw-r--r-- | io_uring/advise.c | 29 |
| -rw-r--r-- | io_uring/fs.c | 20 |
| -rw-r--r-- | io_uring/io_uring.c | 474 |
| -rw-r--r-- | io_uring/io_uring.h | 97 |
| -rw-r--r-- | io_uring/msg_ring.c | 31 |
| -rw-r--r-- | io_uring/net.c | 4 |
| -rw-r--r-- | io_uring/notif.c | 3 |
| -rw-r--r-- | io_uring/opdef.c | 340 |
| -rw-r--r-- | io_uring/opdef.h | 13 |
| -rw-r--r-- | io_uring/openclose.c | 18 |
| -rw-r--r-- | io_uring/poll.c | 2 |
| -rw-r--r-- | io_uring/rw.c | 4 |
| -rw-r--r-- | io_uring/splice.c | 7 |
| -rw-r--r-- | io_uring/sqpoll.c | 3 |
| -rw-r--r-- | io_uring/sqpoll.h | 2 |
| -rw-r--r-- | io_uring/statx.c | 4 |
| -rw-r--r-- | io_uring/sync.c | 14 |
| -rw-r--r-- | io_uring/xattr.c | 14 |
20 files changed, 699 insertions, 409 deletions
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 128a67a40065..0efe4d784358 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -195,21 +195,23 @@ struct io_alloc_cache {
 struct io_ring_ctx {
 	/* const or read-mostly hot data */
 	struct {
-		struct percpu_ref	refs;
-
-		struct io_rings		*rings;
 		unsigned int		flags;
-		enum task_work_notify_mode	notify_method;
-		unsigned int		compat: 1;
 		unsigned int		drain_next: 1;
 		unsigned int		restricted: 1;
 		unsigned int		off_timeout_used: 1;
 		unsigned int		drain_active: 1;
-		unsigned int		drain_disabled: 1;
 		unsigned int		has_evfd: 1;
-		unsigned int		syscall_iopoll: 1;
 		/* all CQEs should be posted only by the submitter task */
 		unsigned int		task_complete: 1;
+		unsigned int		syscall_iopoll: 1;
+		unsigned int		poll_activated: 1;
+		unsigned int		drain_disabled: 1;
+		unsigned int		compat: 1;
+
+		enum task_work_notify_mode	notify_method;
+		struct io_rings			*rings;
+		struct task_struct		*submitter_task;
+		struct percpu_ref		refs;
 	} ____cacheline_aligned_in_smp;

 	/* submission data */
@@ -293,6 +295,7 @@ struct io_ring_ctx {

 	spinlock_t		completion_lock;
 	bool			poll_multi_queue;
+	bool			cq_waiting;

 	/*
 	 * ->iopoll_list is protected by the ctx->uring_lock for
@@ -318,9 +321,8 @@ struct io_ring_ctx {
 	} ____cacheline_aligned_in_smp;

 	/* Keep this last, we don't need it for the fast path */
-
+	struct wait_queue_head		poll_wq;
 	struct io_restriction		restrictions;
-	struct task_struct		*submitter_task;

 	/* slow path rsrc auxilary data, used by update/register */
 	struct io_rsrc_node		*rsrc_backup_node;
@@ -357,6 +359,7 @@ struct io_ring_ctx {
 	u32				iowq_limits[2];
 	bool				iowq_limits_set;

+	struct callback_head		poll_wq_task_work;
 	struct list_head		defer_list;
 	unsigned			sq_thread_idle;
 	/* protected by ->completion_lock */

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 2780bce62faf..97661a60b28c 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -347,6 +347,8 @@ enum {
  * applicable for IORING_MSG_DATA, obviously.
  */
 #define IORING_MSG_RING_CQE_SKIP	(1U << 0)
+/* Pass through the flags from sqe->file_index to cqe->flags */
+#define IORING_MSG_RING_FLAGS_PASS	(1U << 1)

 /*
  * IO completion data structure (Completion Queue Entry)
@@ -470,6 +472,7 @@ struct io_uring_params {
 #define IORING_FEAT_RSRC_TAGS		(1U << 10)
 #define IORING_FEAT_CQE_SKIP		(1U << 11)
 #define IORING_FEAT_LINKED_FILE		(1U << 12)
+#define IORING_FEAT_REG_REG_RING	(1U << 13)

 /*
  * io_uring_register(2) opcodes and arguments
@@ -517,7 +520,10 @@ enum {
 	IORING_REGISTER_FILE_ALLOC_RANGE	= 25,

 	/* this goes last */
-	IORING_REGISTER_LAST
+	IORING_REGISTER_LAST,
+
+	/* flag added to the opcode to use a registered ring fd */
+	IORING_REGISTER_USE_REGISTERED_RING	= 1U << 31
 };

 /* io-wq worker categories */

diff --git a/io_uring/advise.c b/io_uring/advise.c
index 449c6f14649f..7085804c513c 100644
--- a/io_uring/advise.c
+++ b/io_uring/advise.c
@@ -39,6 +39,7 @@ int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	ma->addr = READ_ONCE(sqe->addr);
 	ma->len = READ_ONCE(sqe->len);
 	ma->advice = READ_ONCE(sqe->fadvise_advice);
+	req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
 #else
 	return -EOPNOTSUPP;
@@ -51,8 +52,7 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise);
 	int ret;

-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);

 	ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
 	io_req_set_res(req, ret, 0);
@@ -62,6 +62,18 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
 #endif
 }

+static bool io_fadvise_force_async(struct io_fadvise *fa)
+{
+	switch (fa->advice) {
+	case POSIX_FADV_NORMAL:
+	case POSIX_FADV_RANDOM:
+	case POSIX_FADV_SEQUENTIAL:
+		return false;
+	default:
+		return true;
+	}
+}
+
 int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise);
@@ -72,6 +84,8 @@ int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	fa->offset = READ_ONCE(sqe->off);
 	fa->len = READ_ONCE(sqe->len);
 	fa->advice = READ_ONCE(sqe->fadvise_advice);
+	if (io_fadvise_force_async(fa))
+		req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
 }

@@ -80,16 +94,7 @@ int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise);
 	int ret;

-	if (issue_flags & IO_URING_F_NONBLOCK) {
-		switch (fa->advice) {
-		case POSIX_FADV_NORMAL:
-		case POSIX_FADV_RANDOM:
-		case POSIX_FADV_SEQUENTIAL:
-			break;
-		default:
-			return -EAGAIN;
-		}
-	}
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK && io_fadvise_force_async(fa));

 	ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
 	if (ret < 0)

diff --git a/io_uring/fs.c b/io_uring/fs.c
index 7100c293c13a..f6a69a549fd4 100644
--- a/io_uring/fs.c
+++ b/io_uring/fs.c
@@ -74,6 +74,7 @@ int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	}

 	req->flags |= REQ_F_NEED_CLEANUP;
+	req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
 }

@@ -82,8 +83,7 @@ int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename);
 	int ret;

-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);

 	ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
 				ren->newpath, ren->flags);
@@ -123,6 +123,7 @@ int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return PTR_ERR(un->filename);

 	req->flags |= REQ_F_NEED_CLEANUP;
+	req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
 }

@@ -131,8 +132,7 @@ int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_unlink *un = io_kiocb_to_cmd(req, struct io_unlink);
 	int ret;

-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);

 	if (un->flags & AT_REMOVEDIR)
 		ret = do_rmdir(un->dfd, un->filename);
@@ -170,6 +170,7 @@ int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return PTR_ERR(mkd->filename);

 	req->flags |= REQ_F_NEED_CLEANUP;
+	req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
 }

@@ -178,8 +179,7 @@ int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_mkdir *mkd = io_kiocb_to_cmd(req, struct io_mkdir);
 	int ret;

-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);

 	ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);

@@ -220,6 +220,7 @@ int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	}

 	req->flags |= REQ_F_NEED_CLEANUP;
+	req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
 }

@@ -228,8 +229,7 @@ int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_link *sl = io_kiocb_to_cmd(req, struct io_link);
 	int ret;

-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);

 	ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);

@@ -265,6 +265,7 @@ int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	}

 	req->flags |= REQ_F_NEED_CLEANUP;
+	req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
 }

@@ -273,8 +274,7 @@ int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_link *lnk = io_kiocb_to_cmd(req, struct io_link);
 	int ret;

-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);

 	ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd,
 				lnk->newpath, lnk->flags);

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index db623b3185c8..3b915deb4d08 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -151,7 +151,7 @@ static void io_move_task_work_from_local(struct io_ring_ctx *ctx);
 static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
 static __cold void io_fallback_tw(struct io_uring_task *tctx);

-static struct kmem_cache *req_cachep;
+struct kmem_cache *req_cachep;

 struct sock *io_uring_get_socket(struct file *file)
 {
@@ -230,6 +230,7 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res)
 static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
 {
 	wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
+	kasan_poison_object_data(req_cachep, req);
 }

 static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
@@ -245,17 +246,15 @@ static __cold void io_fallback_req_func(struct work_struct *work)
						fallback_work.work);
 	struct llist_node *node = llist_del_all(&ctx->fallback_llist);
 	struct io_kiocb *req, *tmp;
-	bool locked = false;
+	bool locked = true;

-	percpu_ref_get(&ctx->refs);
+	mutex_lock(&ctx->uring_lock);
 	llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
 		req->io_task_work.func(req, &locked);
-
-	if (locked) {
-		io_submit_flush_completions(ctx);
-		mutex_unlock(&ctx->uring_lock);
-	}
-	percpu_ref_put(&ctx->refs);
+	if (WARN_ON_ONCE(!locked))
+		return;
+	io_submit_flush_completions(ctx);
+	mutex_unlock(&ctx->uring_lock);
 }

 static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits)
@@ -316,6 +315,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
 	mutex_init(&ctx->uring_lock);
 	init_waitqueue_head(&ctx->cq_wait);
+	init_waitqueue_head(&ctx->poll_wq);
 	spin_lock_init(&ctx->completion_lock);
 	spin_lock_init(&ctx->timeout_lock);
 	INIT_WQ_LIST(&ctx->iopoll_list);
@@ -407,7 +407,7 @@ static inline void io_arm_ltimeout(struct io_kiocb *req)

 static void io_prep_async_work(struct io_kiocb *req)
 {
-	const struct io_op_def *def = &io_op_defs[req->opcode];
+	const struct io_issue_def *def = &io_issue_defs[req->opcode];
 	struct io_ring_ctx *ctx = req->ctx;

 	if (!(req->flags & REQ_F_CREDS)) {
@@ -572,6 +572,8 @@ static void io_eventfd_flush_signal(struct io_ring_ctx *ctx)

 void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
 {
+	if (ctx->poll_activated)
+		io_poll_wq_wake(ctx);
 	if (ctx->off_timeout_used)
 		io_flush_timeouts(ctx);
 	if (ctx->drain_active) {
@@ -618,6 +620,25 @@ static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
 	io_cqring_wake(ctx);
 }

+static inline void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx)
+	__releases(ctx->completion_lock)
+{
+	io_commit_cqring(ctx);
+	__io_cq_unlock(ctx);
+	io_commit_cqring_flush(ctx);
+
+	/*
+	 * As ->task_complete implies that the ring is single tasked, cq_wait
+	 * may only be waited on by the current in io_cqring_wait(), but since
+	 * it will re-check the wakeup conditions once we return we can safely
+	 * skip waking it up.
+	 */
+	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) {
+		smp_mb();
+		__io_cqring_wake(ctx);
+	}
+}
+
 void io_cq_unlock_post(struct io_ring_ctx *ctx)
 	__releases(ctx->completion_lock)
 {
@@ -645,7 +666,6 @@ static void io_cqring_overflow_kill(struct io_ring_ctx *ctx)
 	}
 }

-/* Returns true if there are no backlogged entries after the flush */
 static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx)
 {
 	size_t cqe_size = sizeof(struct io_uring_cqe);
@@ -693,7 +713,8 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx)
 		io_cqring_do_overflow_flush(ctx);
 }

-void __io_put_task(struct task_struct *task, int nr)
+/* can be called by any task */
+static void io_put_task_remote(struct task_struct *task, int nr)
 {
 	struct io_uring_task *tctx = task->io_uring;

@@ -703,6 +724,21 @@ void __io_put_task(struct task_struct *task, int nr)
 	put_task_struct_many(task, nr);
 }

+/* used by a task to put its own references */
+static void io_put_task_local(struct task_struct *task, int nr)
+{
+	task->io_uring->cached_refs += nr;
+}
+
+/* must to be called somewhat shortly after putting a request */
+static inline void io_put_task(struct task_struct *task, int nr)
+{
+	if (likely(task == current))
+		io_put_task_local(task, nr);
+	else
+		io_put_task_remote(task, nr);
+}
+
 void io_task_refs_refill(struct io_uring_task *tctx)
 {
 	unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
@@ -945,15 +981,15 @@ static void __io_req_complete_post(struct io_kiocb *req)
 			req->link = NULL;
 		}
 	}
+	io_put_kbuf_comp(req);
+	io_dismantle_req(req);
 	io_req_put_rsrc(req);
 	/*
 	 * Selected buffer deallocation in io_clean_op() assumes that
 	 * we don't hold ->completion_lock. Clean them here to avoid
 	 * deadlocks.
 	 */
-	io_put_kbuf_comp(req);
-	io_dismantle_req(req);
-	io_put_task(req->task, 1);
+	io_put_task_remote(req->task, 1);
 	wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
 	ctx->locked_free_nr++;
 }
@@ -980,7 +1016,7 @@ void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
 void io_req_defer_failed(struct io_kiocb *req, s32 res)
 	__must_hold(&ctx->uring_lock)
 {
-	const struct io_op_def *def = &io_op_defs[req->opcode];
+	const struct io_cold_def *def = &io_cold_defs[req->opcode];

 	lockdep_assert_held(&req->ctx->uring_lock);

@@ -1076,7 +1112,7 @@ __cold void io_free_req(struct io_kiocb *req)

 	io_req_put_rsrc(req);
 	io_dismantle_req(req);
-	io_put_task(req->task, 1);
+	io_put_task_remote(req->task, 1);

 	spin_lock(&ctx->completion_lock);
 	wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
@@ -1130,7 +1166,7 @@ static unsigned int handle_tw_list(struct llist_node *node,
 {
 	unsigned int count = 0;

-	while (node != last) {
+	while (node && node != last) {
 		struct llist_node *next = node->next;
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
						    io_task_work.node);
@@ -1143,10 +1179,16 @@ static unsigned int handle_tw_list(struct llist_node *node,
 			/* if not contended, grab and improve batching */
 			*locked = mutex_trylock(&(*ctx)->uring_lock);
 			percpu_ref_get(&(*ctx)->refs);
-		}
+		} else if (!*locked)
+			*locked = mutex_trylock(&(*ctx)->uring_lock);
 		req->io_task_work.func(req, locked);
 		node = next;
 		count++;
+		if (unlikely(need_resched())) {
+			ctx_flush_and_put(*ctx, locked);
+			*ctx = NULL;
+			cond_resched();
+		}
 	}

 	return count;
@@ -1190,23 +1232,29 @@ void tctx_task_work(struct callback_head *cb)
						  task_work);
 	struct llist_node fake = {};
 	struct llist_node *node;
-	unsigned int loops = 1;
-	unsigned int count;
+	unsigned int loops = 0;
+	unsigned int count = 0;

 	if (unlikely(current->flags & PF_EXITING)) {
 		io_fallback_tw(tctx);
 		return;
 	}

-	node = io_llist_xchg(&tctx->task_list, &fake);
-	count = handle_tw_list(node, &ctx, &uring_locked, NULL);
-	node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
-	while (node != &fake) {
+	do {
 		loops++;
 		node = io_llist_xchg(&tctx->task_list, &fake);
 		count += handle_tw_list(node, &ctx, &uring_locked, &fake);
+
+		/* skip expensive cmpxchg if there are items in the list */
+		if (READ_ONCE(tctx->task_list.first) != &fake)
+			continue;
+		if (uring_locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) {
+			io_submit_flush_completions(ctx);
+			if (READ_ONCE(tctx->task_list.first) != &fake)
+				continue;
+		}
 		node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
-	}
+	} while (node != &fake);

 	ctx_flush_and_put(ctx, &uring_locked);
@@ -1241,7 +1289,7 @@ static void io_req_local_work_add(struct io_kiocb *req)
 		percpu_ref_put(&ctx->refs);
 		return;
 	}
-	/* need it for the following io_cqring_wake() */
+	/* needed for the following wake up */
 	smp_mb__after_atomic();

 	if (unlikely(atomic_read(&req->task->io_uring->in_idle))) {
@@ -1252,10 +1300,11 @@ static void io_req_local_work_add(struct io_kiocb *req)

 	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
 		atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
-
 	if (ctx->has_evfd)
 		io_eventfd_signal(ctx);
-	__io_cqring_wake(ctx);
+
+	if (READ_ONCE(ctx->cq_waiting))
+		wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
 	percpu_ref_put(&ctx->refs);
 }
@@ -1296,21 +1345,19 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
 	}
 }

-int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked)
+static int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked)
 {
 	struct llist_node *node;
-	struct llist_node fake;
-	struct llist_node *current_final = NULL;
-	int ret;
-	unsigned int loops = 1;
+	unsigned int loops = 0;
+	int ret = 0;

-	if (unlikely(ctx->submitter_task != current))
+	if (WARN_ON_ONCE(ctx->submitter_task != current))
 		return -EEXIST;
-
-	node = io_llist_xchg(&ctx->work_llist, &fake);
-	ret = 0;
+	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
+		atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
 again:
-	while (node != current_final) {
+	node = io_llist_xchg(&ctx->work_llist, NULL);
+	while (node) {
 		struct llist_node *next = node->next;
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
						    io_task_work.node);
@@ -1319,26 +1366,20 @@ again:
 		ret++;
 		node = next;
 	}
+	loops++;

-	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
-		atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
-
-	node = io_llist_cmpxchg(&ctx->work_llist, &fake, NULL);
-	if (node != &fake) {
-		loops++;
-		current_final = &fake;
-		node = io_llist_xchg(&ctx->work_llist, &fake);
+	if (!llist_empty(&ctx->work_llist))
 		goto again;
-	}
-
-	if (*locked)
+	if (*locked) {
 		io_submit_flush_completions(ctx);
+		if (!llist_empty(&ctx->work_llist))
+			goto again;
+	}
 	trace_io_uring_local_work_run(ctx, ret, loops);
 	return ret;
-
 }

-int io_run_local_work(struct io_ring_ctx *ctx)
+static inline int io_run_local_work_locked(struct io_ring_ctx *ctx)
 {
 	bool locked;
 	int ret;
@@ -1346,8 +1387,19 @@ int io_run_local_work(struct io_ring_ctx *ctx)
 	if (llist_empty(&ctx->work_llist))
 		return 0;

-	__set_current_state(TASK_RUNNING);
-	locked = mutex_trylock(&ctx->uring_lock);
+	locked = true;
+	ret = __io_run_local_work(ctx, &locked);
+	/* shouldn't happen! */
+	if (WARN_ON_ONCE(!locked))
+		mutex_lock(&ctx->uring_lock);
+	return ret;
+}
+
+static int io_run_local_work(struct io_ring_ctx *ctx)
+{
+	bool locked = mutex_trylock(&ctx->uring_lock);
+	int ret;
+
 	ret = __io_run_local_work(ctx, &locked);
 	if (locked)
 		mutex_unlock(&ctx->uring_lock);
@@ -1365,10 +1417,12 @@ void io_req_task_submit(struct io_kiocb *req, bool *locked)
 {
 	io_tw_lock(req->ctx, locked);
 	/* req->task == current here, checking PF_EXITING is safe */
-	if (likely(!(req->task->flags & PF_EXITING)))
-		io_queue_sqe(req);
-	else
+	if (unlikely(req->task->flags & PF_EXITING))
 		io_req_defer_failed(req, -EFAULT);
+	else if (req->flags & REQ_F_FORCE_ASYNC)
+		io_queue_iowq(req, locked);
+	else
+		io_queue_sqe(req);
 }

 void io_req_task_queue_fail(struct io_kiocb *req, int ret)
@@ -1467,7 +1521,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
			}
 		}
 	}
-	__io_cq_unlock_post(ctx);
+	__io_cq_unlock_post_flush(ctx);

 	if (!wq_list_empty(&ctx->submit_state.compl_reqs)) {
 		io_free_batch_list(ctx, state->compl_reqs.first);
@@ -1708,8 +1762,8 @@ unsigned int io_file_get_flags(struct file *file)

 bool io_alloc_async_data(struct io_kiocb *req)
 {
-	WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
-	req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
+	WARN_ON_ONCE(!io_cold_defs[req->opcode].async_size);
+	req->async_data = kmalloc(io_cold_defs[req->opcode].async_size, GFP_KERNEL);
 	if (req->async_data) {
 		req->flags |= REQ_F_ASYNC_DATA;
 		return false;
@@ -1719,20 +1773,21 @@ bool io_alloc_async_data(struct io_kiocb *req)

 int io_req_prep_async(struct io_kiocb *req)
 {
-	const struct io_op_def *def = &io_op_defs[req->opcode];
+	const struct io_cold_def *cdef = &io_cold_defs[req->opcode];
+	const struct io_issue_def *def = &io_issue_defs[req->opcode];

 	/* assign early for deferred execution for non-fixed file */
 	if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE))
 		req->file = io_file_get_normal(req, req->cqe.fd);
-	if (!def->prep_async)
+	if (!cdef->prep_async)
 		return 0;
 	if (WARN_ON_ONCE(req_has_async_data(req)))
 		return -EFAULT;
-	if (!io_op_defs[req->opcode].manual_alloc) {
+	if (!def->manual_alloc) {
 		if (io_alloc_async_data(req))
 			return -EAGAIN;
 	}
-	return def->prep_async(req);
+	return cdef->prep_async(req);
 }

 static u32 io_get_sequence(struct io_kiocb *req)
@@ -1796,7 +1851,7 @@ static void io_clean_op(struct io_kiocb *req)
 	}

 	if (req->flags & REQ_F_NEED_CLEANUP) {
-		const struct io_op_def *def = &io_op_defs[req->opcode];
+		const struct io_cold_def *def = &io_cold_defs[req->opcode];

 		if (def->cleanup)
 			def->cleanup(req);
@@ -1820,9 +1875,10 @@ static void io_clean_op(struct io_kiocb *req)
 	req->flags &= ~IO_REQ_CLEAN_FLAGS;
 }

-static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags)
+static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
+			   unsigned int issue_flags)
 {
-	if (req->file || !io_op_defs[req->opcode].needs_file)
+	if (req->file || !def->needs_file)
 		return true;

 	if (req->flags & REQ_F_FIXED_FILE)
@@ -1835,11 +1891,11 @@ static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags)

 static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
 {
-	const struct io_op_def *def = &io_op_defs[req->opcode];
+	const struct io_issue_def *def = &io_issue_defs[req->opcode];
 	const struct cred *creds = NULL;
 	int ret;

-	if (unlikely(!io_assign_file(req, issue_flags)))
+	if (unlikely(!io_assign_file(req, def, issue_flags)))
 		return -EBADF;

 	if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
@@ -1889,7 +1945,7 @@ struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
 void io_wq_submit_work(struct io_wq_work *work)
 {
 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-	const struct io_op_def *def = &io_op_defs[req->opcode];
+	const struct io_issue_def *def = &io_issue_defs[req->opcode];
 	unsigned int issue_flags = IO_URING_F_UNLOCKED | IO_URING_F_IOWQ;
 	bool needs_poll = false;
 	int ret = 0, err = -ECANCELED;
@@ -1908,7 +1964,7 @@ fail:
 		io_req_task_queue_fail(req, err);
 		return;
 	}
-	if (!io_assign_file(req, issue_flags)) {
+	if (!io_assign_file(req, def, issue_flags)) {
 		err = -EBADF;
 		work->flags |= IO_WQ_WORK_CANCEL;
 		goto fail;
@@ -2104,7 +2160,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
		       const struct io_uring_sqe *sqe)
	__must_hold(&ctx->uring_lock)
 {
-	const struct io_op_def *def;
+	const struct io_issue_def *def;
 	unsigned int sqe_flags;
 	int personality;
 	u8 opcode;
@@ -2122,7 +2178,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		req->opcode = 0;
 		return -EINVAL;
 	}
-	def = &io_op_defs[opcode];
+	def = &io_issue_defs[opcode];
 	if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
 		/* enforce forwards compatibility on users */
 		if (sqe_flags & ~SQE_VALID_FLAGS)
@@ -2333,7 +2389,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
  * used, it's important that those reads are done through READ_ONCE() to
  * prevent a re-load down the line.
  */
-static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
+static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
 {
 	unsigned head, mask = ctx->sq_entries - 1;
 	unsigned sq_idx = ctx->cached_sq_head++ & mask;
@@ -2351,14 +2407,15 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
 		/* double index for 128-byte SQEs, twice as long */
 		if (ctx->flags & IORING_SETUP_SQE128)
 			head <<= 1;
-		return &ctx->sq_sqes[head];
+		*sqe = &ctx->sq_sqes[head];
+		return true;
 	}

 	/* drop invalid entries */
 	ctx->cq_extra--;
 	WRITE_ONCE(ctx->rings->sq_dropped,
 		   READ_ONCE(ctx->rings->sq_dropped) + 1);
-	return NULL;
+	return false;
 }

 int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
@@ -2379,11 +2436,9 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 		const struct io_uring_sqe *sqe;
 		struct io_kiocb *req;

-		if (unlikely(!io_alloc_req_refill(ctx)))
+		if (unlikely(!io_alloc_req(ctx, &req)))
 			break;
-		req = io_alloc_req(ctx);
-		sqe = io_get_sqe(ctx);
-		if (unlikely(!sqe)) {
+		if (unlikely(!io_get_sqe(ctx, &sqe))) {
 			io_req_add_to_cache(req, ctx);
 			break;
 		}
@@ -2418,13 +2473,13 @@ struct io_wait_queue {
 	struct io_ring_ctx *ctx;
 	unsigned cq_tail;
 	unsigned nr_timeouts;
+	ktime_t timeout;
 };

 static inline bool io_has_work(struct io_ring_ctx *ctx)
 {
 	return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) ||
-	       ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
-		!llist_empty(&ctx->work_llist));
+	       !llist_empty(&ctx->work_llist);
 }

 static inline bool io_should_wake(struct io_wait_queue *iowq)
@@ -2443,22 +2498,25 @@ static inline bool io_should_wake(struct io_wait_queue *iowq)
 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
			    int wake_flags, void *key)
 {
-	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
-						  wq);
-	struct io_ring_ctx *ctx = iowq->ctx;
+	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq);

 	/*
 	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
 	 * the task, and the next invocation will do it.
 	 */
-	if (io_should_wake(iowq) || io_has_work(ctx))
+	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
 		return autoremove_wake_function(curr, mode, wake_flags, key);
 	return -1;
 }

 int io_run_task_work_sig(struct io_ring_ctx *ctx)
 {
-	if (io_run_task_work_ctx(ctx) > 0)
+	if (!llist_empty(&ctx->work_llist)) {
+		__set_current_state(TASK_RUNNING);
+		if (io_run_local_work(ctx) > 0)
+			return 1;
+	}
+	if (io_run_task_work() > 0)
 		return 1;
 	if (task_sigpending(current))
 		return -EINTR;
@@ -2467,35 +2525,23 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx)

 /* when returns >0, the caller should retry */
 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
-					  struct io_wait_queue *iowq,
-					  ktime_t *timeout)
+					  struct io_wait_queue *iowq)
 {
-	int ret;
-	unsigned long check_cq;
-
-	/* make sure we run task_work before checking for signals */
-	ret = io_run_task_work_sig(ctx);
-	if (ret || io_should_wake(iowq))
-		return ret;
-
-	check_cq = READ_ONCE(ctx->check_cq);
-	if (unlikely(check_cq)) {
-		/* let the caller flush overflows, retry */
-		if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
-			return 1;
-		if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))
-			return -EBADR;
-	}
-	if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS))
+	if (unlikely(READ_ONCE(ctx->check_cq)))
+		return 1;
+	if (unlikely(!llist_empty(&ctx->work_llist)))
+		return 1;
+	if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL)))
+		return 1;
+	if (unlikely(task_sigpending(current)))
+		return -EINTR;
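
Several of the io_uring.c hunks above (the ->task_complete and ->cq_waiting fields, the wake_up_state() path in io_req_local_work_add(), and __io_cq_unlock_post_flush()) only take effect on rings whose completions run in the submitter task. As a reminder of the setup such a ring needs — a minimal sketch, with error handling elided:

```c
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

/* DEFER_TASKRUN requires SINGLE_ISSUER; with both set, completions are
 * only reaped by the task that created the ring, which is what lets the
 * kernel replace the cq_wait wakeup with a direct wake_up_state() on
 * ctx->submitter_task. */
static int setup_defer_taskrun_ring(unsigned int entries,
				    struct io_uring_params *p)
{
	memset(p, 0, sizeof(*p));
	p->flags = IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN;
	return syscall(__NR_io_uring_setup, entries, p);
}
```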
