author      Linus Torvalds <torvalds@linux-foundation.org>  2023-06-26 12:30:26 -0700
committer   Linus Torvalds <torvalds@linux-foundation.org>  2023-06-26 12:30:26 -0700
commit      0aa69d53ac7c30f6184f88f2e310d808b32b35a5 (patch)
tree        26a666a128578dd1c46f39b432543411caa0d6eb
parent      3eccc0c886b1796f95a289c9d127c8ca1a254bd5 (diff)
parent      c98c81a4ac37b651be7eb9d16f562fc4acc5f867 (diff)
download    linux-0aa69d53ac7c30f6184f88f2e310d808b32b35a5.tar.gz
            linux-0aa69d53ac7c30f6184f88f2e310d808b32b35a5.tar.bz2
            linux-0aa69d53ac7c30f6184f88f2e310d808b32b35a5.zip
Merge tag 'for-6.5/io_uring-2023-06-23' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:
"Nothing major in this release, just a bunch of cleanups and some
optimizations around networking mostly.
- clean up file request flags handling (Christoph)
- clean up request freeing and CQ locking (Pavel)
- support for using pre-registering the io_uring fd at setup time
(Josh)
- Add support for user allocated ring memory, rather than having the
kernel allocate it. Mostly for packing rings into a huge page (me)
- avoid an unnecessary double retry on receive (me)
- maintain ordering for task_work, which also improves performance
(me)
- misc cleanups/fixes (Pavel, me)"
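The user-allocated ring memory item is driven entirely from io_uring_setup(2). Below is a minimal, hedged sketch of how an application might carve both the rings and the SQE array out of one huge page and hand the addresses to the kernel. The use of p.cq_off.user_addr for the ring area and p.sq_off.user_addr for the SQE area is an assumption read off the uapi hunk further down, and the 50/50 split of the huge page is purely illustrative.

```c
/*
 * Hedged sketch only: the user_addr handshake and the layout split are
 * assumptions based on the uapi changes in this merge, not a verbatim
 * example from the series. Requires hugetlb pages to be available.
 */
#include <linux/io_uring.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static int ring_setup_in_huge_page(unsigned int entries,
				   struct io_uring_params *p)
{
	size_t huge_sz = 2UL * 1024 * 1024;	/* one 2MB huge page */
	void *mem;

	mem = mmap(NULL, huge_sz, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (mem == MAP_FAILED)
		return -1;

	memset(p, 0, sizeof(*p));
	/* Ask the kernel to pin our pages instead of allocating its own. */
	p->flags = IORING_SETUP_NO_MMAP;
	/* Assumed layout: SQ/CQ rings in the first half, SQEs in the second. */
	p->cq_off.user_addr = (uintptr_t)mem;
	p->sq_off.user_addr = (uintptr_t)mem + huge_sz / 2;

	return (int)syscall(__NR_io_uring_setup, entries, p);
}
```

Note that the mapping stays owned by userspace; the kernel only pins the pages it was handed (see the pin_user_pages_fast()/unpin_user_page() paths added in io_uring.c below), so the application must keep the memory mapped for the lifetime of the ring.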
* tag 'for-6.5/io_uring-2023-06-23' of git://git.kernel.dk/linux: (39 commits)
io_uring: merge conditional unlock flush helpers
io_uring: make io_cq_unlock_post static
io_uring: inline __io_cq_unlock
io_uring: fix acquire/release annotations
io_uring: kill io_cq_unlock()
io_uring: remove IOU_F_TWQ_FORCE_NORMAL
io_uring: don't batch task put on reqs free
io_uring: move io_clean_op()
io_uring: inline io_dismantle_req()
io_uring: remove io_free_req_tw
io_uring: open code io_put_req_find_next
io_uring: add helpers to decode the fixed file file_ptr
io_uring: use io_file_from_index in io_msg_grab_file
io_uring: use io_file_from_index in __io_sync_cancel
io_uring: return REQ_F_ flags from io_file_get_flags
io_uring: remove io_req_ffs_set
io_uring: remove a confusing comment above io_file_get_flags
io_uring: remove the mode variable in io_file_get_flags
io_uring: remove __io_file_supports_nowait
io_uring: wait interruptibly for request completions on exit
...
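Several of the filetable commits above ("add helpers to decode the fixed file file_ptr", "return REQ_F_ flags from io_file_get_flags", "remove io_req_ffs_set") revolve around one encoding: the fixed-file table stores a struct file pointer with two flag bits packed into its low, alignment-guaranteed bits. The standalone sketch below illustrates that scheme; only the FFS_* values come from the diff, the struct and helper names are made up for the example.

```c
/* Standalone illustration of the file_ptr tagging used by the fixed-file table. */
#include <assert.h>
#include <stdlib.h>

#define FFS_NOWAIT	0x1UL
#define FFS_ISREG	0x2UL
#define FFS_MASK	(~(FFS_NOWAIT | FFS_ISREG))

struct file { int placeholder; };		/* stand-in for struct file */

struct io_fixed_slot {
	unsigned long file_ptr;			/* pointer | flag bits */
};

static void slot_set(struct io_fixed_slot *slot, struct file *file,
		     unsigned long flags)
{
	/*
	 * Heap/slab pointers are at least 4-byte aligned, so the low two
	 * bits are free to carry FFS_NOWAIT/FFS_ISREG.
	 */
	slot->file_ptr = (unsigned long)file | (flags & ~FFS_MASK);
}

static struct file *slot_file(struct io_fixed_slot *slot)
{
	return (struct file *)(slot->file_ptr & FFS_MASK);
}

static unsigned long slot_flags(struct io_fixed_slot *slot)
{
	return slot->file_ptr & ~FFS_MASK;
}

int main(void)
{
	struct file *f = malloc(sizeof(*f));
	struct io_fixed_slot slot;

	slot_set(&slot, f, FFS_NOWAIT | FFS_ISREG);
	assert(slot_file(&slot) == f);
	assert(slot_flags(&slot) == (FFS_NOWAIT | FFS_ISREG));
	free(f);
	return 0;
}
```

The cleanup in this series additionally lines those two bits up with REQ_F_SUPPORT_NOWAIT/REQ_F_ISREG (see the REQ_F_SUPPORT_NOWAIT_BIT shifts in io_uring/filetable.h below), so the decoded slot flags can be OR'ed straight into request flags without a second translation step.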
-rw-r--r--  block/fops.c                    |   5
-rw-r--r--  drivers/nvme/host/ioctl.c       |   4
-rw-r--r--  include/linux/io_uring.h        |  18
-rw-r--r--  include/linux/io_uring_types.h  |  10
-rw-r--r--  include/uapi/linux/io_uring.h   |  16
-rw-r--r--  io_uring/cancel.c               |   5
-rw-r--r--  io_uring/filetable.c            |  11
-rw-r--r--  io_uring/filetable.h            |  28
-rw-r--r--  io_uring/io_uring.c             | 497
-rw-r--r--  io_uring/io_uring.h             |  17
-rw-r--r--  io_uring/msg_ring.c             |   4
-rw-r--r--  io_uring/net.c                  |  58
-rw-r--r--  io_uring/poll.c                 |   6
-rw-r--r--  io_uring/poll.h                 |   2
-rw-r--r--  io_uring/rsrc.c                 |   8
-rw-r--r--  io_uring/rw.c                   |   6
-rw-r--r--  io_uring/rw.h                   |   1
-rw-r--r--  io_uring/tctx.c                 |  31
-rw-r--r--  io_uring/timeout.c              |   6
-rw-r--r--  io_uring/uring_cmd.c            |  16
-rw-r--r--  net/socket.c                    |   1
21 files changed, 416 insertions(+), 334 deletions(-)
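The "maintain ordering for task_work" change shows up further down in __io_run_local_work(): llist_add() pushes to the head, so the pending list comes out newest-first, and the patch flips it with llist_reverse_order() before running the items. The sketch below is a plain C illustration of the same push-then-reverse pattern, not the kernel llist API itself.

```c
#include <stdio.h>
#include <stdlib.h>

struct work_item {
	int seq;
	struct work_item *next;
};

/* Push to the head, like llist_add(): cheap and lock-free friendly, but LIFO. */
static struct work_item *push(struct work_item *head, int seq)
{
	struct work_item *item = malloc(sizeof(*item));

	item->seq = seq;
	item->next = head;
	return item;
}

/* Reverse in place, like llist_reverse_order(), to recover submission order. */
static struct work_item *reverse(struct work_item *head)
{
	struct work_item *prev = NULL;

	while (head) {
		struct work_item *next = head->next;

		head->next = prev;
		prev = head;
		head = next;
	}
	return prev;
}

int main(void)
{
	struct work_item *list = NULL;

	for (int i = 0; i < 4; i++)
		list = push(list, i);

	/* Without the reverse this would run items as 3 2 1 0. */
	for (struct work_item *item = reverse(list); item; ) {
		struct work_item *next = item->next;

		printf("run task_work item %d\n", item->seq);
		free(item);
		item = next;
	}
	return 0;
}
```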
diff --git a/block/fops.c b/block/fops.c index 18dedb730493..4faeada05b23 100644 --- a/block/fops.c +++ b/block/fops.c @@ -481,7 +481,7 @@ static int blkdev_open(struct inode *inode, struct file *filp) * during an unstable branch. */ filp->f_flags |= O_LARGEFILE; - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + filp->f_mode |= FMODE_BUF_RASYNC; if (filp->f_flags & O_NDELAY) filp->f_mode |= FMODE_NDELAY; @@ -494,6 +494,9 @@ static int blkdev_open(struct inode *inode, struct file *filp) if (IS_ERR(bdev)) return PTR_ERR(bdev); + if (bdev_nowait(bdev)) + filp->f_mode |= FMODE_NOWAIT; + filp->private_data = bdev; filp->f_mapping = bdev->bd_inode->i_mapping; filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index f15e7330b75a..b97744e05551 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -521,7 +521,7 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, if (cookie != NULL && blk_rq_is_poll(req)) nvme_uring_task_cb(ioucmd, IO_URING_F_UNLOCKED); else - io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb); + io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); return RQ_END_IO_FREE; } @@ -543,7 +543,7 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req, if (cookie != NULL && blk_rq_is_poll(req)) nvme_uring_task_meta_cb(ioucmd, IO_URING_F_UNLOCKED); else - io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_meta_cb); + io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_meta_cb); return RQ_END_IO_NONE; } diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 7fe31b2cd02f..bb9c666bd584 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -46,13 +46,23 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd); void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2, unsigned issue_flags); -void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)); struct sock *io_uring_get_socket(struct file *file); void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); void io_uring_unreg_ringfd(void); const char *io_uring_get_opcode(u8 opcode); +void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned), + unsigned flags); +/* users should follow semantics of IOU_F_TWQ_LAZY_WAKE */ +void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)); + +static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +{ + __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); +} static inline void io_uring_files_cancel(void) { @@ -85,6 +95,10 @@ static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, void (*task_work_cb)(struct io_uring_cmd *, unsigned)) { } +static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +{ +} static inline struct sock *io_uring_get_socket(struct file *file) { return NULL; diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 1b2a20a42413..f04ce513fadb 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -211,6 +211,16 @@ struct io_ring_ctx { unsigned int compat: 1; enum task_work_notify_mode notify_method; + 
+ /* + * If IORING_SETUP_NO_MMAP is used, then the below holds + * the gup'ed pages for the two rings, and the sqes. + */ + unsigned short n_ring_pages; + unsigned short n_sqe_pages; + struct page **ring_pages; + struct page **sqe_pages; + struct io_rings *rings; struct task_struct *submitter_task; struct percpu_ref refs; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 0716cb17e436..f222d263bc55 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -173,6 +173,18 @@ enum { */ #define IORING_SETUP_DEFER_TASKRUN (1U << 13) +/* + * Application provides the memory for the rings + */ +#define IORING_SETUP_NO_MMAP (1U << 14) + +/* + * Register the ring fd in itself for use with + * IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather + * than an fd. + */ +#define IORING_SETUP_REGISTERED_FD_ONLY (1U << 15) + enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, @@ -406,7 +418,7 @@ struct io_sqring_offsets { __u32 dropped; __u32 array; __u32 resv1; - __u64 resv2; + __u64 user_addr; }; /* @@ -425,7 +437,7 @@ struct io_cqring_offsets { __u32 cqes; __u32 flags; __u32 resv1; - __u64 resv2; + __u64 user_addr; }; /* diff --git a/io_uring/cancel.c b/io_uring/cancel.c index b4f5dfacc0c3..58c46c852bdd 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -216,13 +216,10 @@ static int __io_sync_cancel(struct io_uring_task *tctx, /* fixed must be grabbed every time since we drop the uring_lock */ if ((cd->flags & IORING_ASYNC_CANCEL_FD) && (cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) { - unsigned long file_ptr; - if (unlikely(fd >= ctx->nr_user_files)) return -EBADF; fd = array_index_nospec(fd, ctx->nr_user_files); - file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; - cd->file = (struct file *) (file_ptr & FFS_MASK); + cd->file = io_file_from_index(&ctx->file_table, fd); if (!cd->file) return -EBADF; } diff --git a/io_uring/filetable.c b/io_uring/filetable.c index 0f6fa791a47d..e7d749991de4 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -78,10 +78,8 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); if (file_slot->file_ptr) { - struct file *old_file; - - old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, slot_index, old_file); + ret = io_queue_rsrc_removal(ctx->file_data, slot_index, + io_slot_file(file_slot)); if (ret) return ret; @@ -140,7 +138,6 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset) { struct io_fixed_file *file_slot; - struct file *file; int ret; if (unlikely(!ctx->file_data)) @@ -153,8 +150,8 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset) if (!file_slot->file_ptr) return -EBADF; - file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, offset, file); + ret = io_queue_rsrc_removal(ctx->file_data, offset, + io_slot_file(file_slot)); if (ret) return ret; diff --git a/io_uring/filetable.h b/io_uring/filetable.h index 351111ff8882..b47adf170c31 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -5,10 +5,6 @@ #include <linux/file.h> #include <linux/io_uring_types.h> -#define FFS_NOWAIT 0x1UL -#define FFS_ISREG 0x2UL -#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) - bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files); void io_free_file_tables(struct 
io_file_table *table); @@ -43,21 +39,31 @@ io_fixed_file_slot(struct io_file_table *table, unsigned i) return &table->files[i]; } +#define FFS_NOWAIT 0x1UL +#define FFS_ISREG 0x2UL +#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) + +static inline unsigned int io_slot_flags(struct io_fixed_file *slot) +{ + return (slot->file_ptr & ~FFS_MASK) << REQ_F_SUPPORT_NOWAIT_BIT; +} + +static inline struct file *io_slot_file(struct io_fixed_file *slot) +{ + return (struct file *)(slot->file_ptr & FFS_MASK); +} + static inline struct file *io_file_from_index(struct io_file_table *table, int index) { - struct io_fixed_file *slot = io_fixed_file_slot(table, index); - - return (struct file *) (slot->file_ptr & FFS_MASK); + return io_slot_file(io_fixed_file_slot(table, index)); } static inline void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file) { - unsigned long file_ptr = (unsigned long) file; - - file_ptr |= io_file_get_flags(file); - file_slot->file_ptr = file_ptr; + file_slot->file_ptr = (unsigned long)file | + (io_file_get_flags(file) >> REQ_F_SUPPORT_NOWAIT_BIT); } static inline void io_reset_alloc_hint(struct io_ring_ctx *ctx) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3bca7a79efda..1b53a2ab0a27 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -95,6 +95,7 @@ #include "timeout.h" #include "poll.h" +#include "rw.h" #include "alloc_cache.h" #define IORING_MAX_ENTRIES 32768 @@ -145,8 +146,6 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); -static void io_dismantle_req(struct io_kiocb *req); -static void io_clean_op(struct io_kiocb *req); static void io_queue_sqe(struct io_kiocb *req); static void io_move_task_work_from_local(struct io_ring_ctx *ctx); static void __io_submit_flush_completions(struct io_ring_ctx *ctx); @@ -367,6 +366,39 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) return false; } +static void io_clean_op(struct io_kiocb *req) +{ + if (req->flags & REQ_F_BUFFER_SELECTED) { + spin_lock(&req->ctx->completion_lock); + io_put_kbuf_comp(req); + spin_unlock(&req->ctx->completion_lock); + } + + if (req->flags & REQ_F_NEED_CLEANUP) { + const struct io_cold_def *def = &io_cold_defs[req->opcode]; + + if (def->cleanup) + def->cleanup(req); + } + if ((req->flags & REQ_F_POLLED) && req->apoll) { + kfree(req->apoll->double_poll); + kfree(req->apoll); + req->apoll = NULL; + } + if (req->flags & REQ_F_INFLIGHT) { + struct io_uring_task *tctx = req->task->io_uring; + + atomic_dec(&tctx->inflight_tracked); + } + if (req->flags & REQ_F_CREDS) + put_cred(req->creds); + if (req->flags & REQ_F_ASYNC_DATA) { + kfree(req->async_data); + req->async_data = NULL; + } + req->flags &= ~IO_REQ_CLEAN_FLAGS; +} + static inline void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { @@ -423,8 +455,8 @@ static void io_prep_async_work(struct io_kiocb *req) if (req->flags & REQ_F_FORCE_ASYNC) req->work.flags |= IO_WQ_WORK_CONCURRENT; - if (req->file && !io_req_ffs_set(req)) - req->flags |= io_file_get_flags(req->file) << REQ_F_SUPPORT_NOWAIT_BIT; + if (req->file && !(req->flags & REQ_F_FIXED_FILE)) + req->flags |= io_file_get_flags(req->file); if (req->file && (req->flags & REQ_F_ISREG)) { bool should_hash = def->hash_reg_file; @@ -594,42 +626,18 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) } static inline void __io_cq_lock(struct io_ring_ctx *ctx) - __acquires(ctx->completion_lock) { if (!ctx->task_complete) spin_lock(&ctx->completion_lock); } -static 
inline void __io_cq_unlock(struct io_ring_ctx *ctx) -{ - if (!ctx->task_complete) - spin_unlock(&ctx->completion_lock); -} - static inline void io_cq_lock(struct io_ring_ctx *ctx) __acquires(ctx->completion_lock) { spin_lock(&ctx->completion_lock); } -static inline void io_cq_unlock(struct io_ring_ctx *ctx) - __releases(ctx->completion_lock) -{ - spin_unlock(&ctx->completion_lock); -} - -/* keep it inlined for io_submit_flush_completions() */ static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) - __releases(ctx->completion_lock) -{ - io_commit_cqring(ctx); - __io_cq_unlock(ctx); - io_commit_cqring_flush(ctx); - io_cqring_wake(ctx); -} - -static void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx) - __releases(ctx->completion_lock) { io_commit_cqring(ctx); @@ -641,13 +649,13 @@ static void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx) */ io_commit_cqring_flush(ctx); } else { - __io_cq_unlock(ctx); + spin_unlock(&ctx->completion_lock); io_commit_cqring_flush(ctx); io_cqring_wake(ctx); } } -void io_cq_unlock_post(struct io_ring_ctx *ctx) +static void io_cq_unlock_post(struct io_ring_ctx *ctx) __releases(ctx->completion_lock) { io_commit_cqring(ctx); @@ -662,10 +670,10 @@ static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) struct io_overflow_cqe *ocqe; LIST_HEAD(list); - io_cq_lock(ctx); + spin_lock(&ctx->completion_lock); list_splice_init(&ctx->cq_overflow_list, &list); clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); - io_cq_unlock(ctx); + spin_unlock(&ctx->completion_lock); while (!list_empty(&list)) { ocqe = list_first_entry(&list, struct io_overflow_cqe, list); @@ -722,29 +730,29 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx) } /* can be called by any task */ -static void io_put_task_remote(struct task_struct *task, int nr) +static void io_put_task_remote(struct task_struct *task) { struct io_uring_task *tctx = task->io_uring; - percpu_counter_sub(&tctx->inflight, nr); + percpu_counter_sub(&tctx->inflight, 1); if (unlikely(atomic_read(&tctx->in_cancel))) wake_up(&tctx->wait); - put_task_struct_many(task, nr); + put_task_struct(task); } /* used by a task to put its own references */ -static void io_put_task_local(struct task_struct *task, int nr) +static void io_put_task_local(struct task_struct *task) { - task->io_uring->cached_refs += nr; + task->io_uring->cached_refs++; } /* must to be called somewhat shortly after putting a request */ -static inline void io_put_task(struct task_struct *task, int nr) +static inline void io_put_task(struct task_struct *task) { if (likely(task == current)) - io_put_task_local(task, nr); + io_put_task_local(task); else - io_put_task_remote(task, nr); + io_put_task_remote(task); } void io_task_refs_refill(struct io_uring_task *tctx) @@ -934,20 +942,19 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags return __io_post_aux_cqe(ctx, user_data, res, cflags, true); } -bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags, +bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags, bool allow_overflow) { + struct io_ring_ctx *ctx = req->ctx; + u64 user_data = req->cqe.user_data; struct io_uring_cqe *cqe; - unsigned int length; if (!defer) return __io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow); - length = ARRAY_SIZE(ctx->submit_state.cqes); - lockdep_assert_held(&ctx->uring_lock); - if (ctx->submit_state.cqes_count == length) { + if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->submit_state.cqes)) { 
__io_cq_lock(ctx); __io_flush_post_cqes(ctx); /* no need to flush - flush is deferred */ @@ -991,14 +998,18 @@ static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) } } io_put_kbuf_comp(req); - io_dismantle_req(req); + if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) + io_clean_op(req); + if (!(req->flags & REQ_F_FIXED_FILE)) + io_put_file(req->file); + rsrc_node = req->rsrc_node; /* * Selected buffer deallocation in io_clean_op() assumes that * we don't hold ->completion_lock. Clean them here to avoid * deadlocks. */ - io_put_task_remote(req->task, 1); + io_put_task_remote(req->task); wq_list_add_head(&req->comp_list, &ctx->locked_free_list); ctx->locked_free_nr++; } @@ -1111,36 +1122,13 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) return true; } -static inline void io_dismantle_req(struct io_kiocb *req) -{ - unsigned int flags = req->flags; - - if (unlikely(flags & IO_REQ_CLEAN_FLAGS)) - io_clean_op(req); - if (!(flags & REQ_F_FIXED_FILE)) - io_put_file(req->file); -} - -static __cold void io_free_req_tw(struct io_kiocb *req, struct io_tw_state *ts) -{ - struct io_ring_ctx *ctx = req->ctx; - - if (req->rsrc_node) { - io_tw_lock(ctx, ts); - io_put_rsrc_node(ctx, req->rsrc_node); - } - io_dismantle_req(req); - io_put_task_remote(req->task, 1); - - spin_lock(&ctx->completion_lock); - wq_list_add_head(&req->comp_list, &ctx->locked_free_list); - ctx->locked_free_nr++; - spin_unlock(&ctx->completion_lock); -} - __cold void io_free_req(struct io_kiocb *req) { - req->io_task_work.func = io_free_req_tw; + /* refs were already put, restore them for io_req_task_complete() */ + req->flags &= ~REQ_F_REFCOUNT; + /* we only want to free it, don't post CQEs */ + req->flags |= REQ_F_CQE_SKIP; + req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); } @@ -1205,7 +1193,9 @@ static unsigned int handle_tw_list(struct llist_node *node, ts->locked = mutex_trylock(&(*ctx)->uring_lock); percpu_ref_get(&(*ctx)->refs); } - req->io_task_work.func(req, ts); + INDIRECT_CALL_2(req->io_task_work.func, + io_poll_task_func, io_req_rw_complete, + req, ts); node = next; count++; if (unlikely(need_resched())) { @@ -1303,7 +1293,7 @@ static __cold void io_fallback_tw(struct io_uring_task *tctx) } } -static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) +static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) { struct io_ring_ctx *ctx = req->ctx; unsigned nr_wait, nr_tw, nr_tw_prev; @@ -1354,19 +1344,11 @@ static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); } -void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) +static void io_req_normal_work_add(struct io_kiocb *req) { struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; - if (!(flags & IOU_F_TWQ_FORCE_NORMAL) && - (ctx->flags & IORING_SETUP_DEFER_TASKRUN)) { - rcu_read_lock(); - io_req_local_work_add(req, flags); - rcu_read_unlock(); - return; - } - /* task_work already pending, we're done */ if (!llist_add(&req->io_task_work.node, &tctx->task_list)) return; @@ -1380,6 +1362,17 @@ void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) io_fallback_tw(tctx); } +void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) +{ + if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + rcu_read_lock(); + io_req_local_work_add(req, flags); + rcu_read_unlock(); + } else { + io_req_normal_work_add(req); + } +} + static void __cold 
io_move_task_work_from_local(struct io_ring_ctx *ctx) { struct llist_node *node; @@ -1390,7 +1383,7 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) io_task_work.node); node = node->next; - __io_req_task_work_add(req, IOU_F_TWQ_FORCE_NORMAL); + io_req_normal_work_add(req); } } @@ -1405,13 +1398,19 @@ static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts) if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); again: - node = io_llist_xchg(&ctx->work_llist, NULL); + /* + * llists are in reverse order, flip it back the right way before + * running the pending items. + */ + node = llist_reverse_order(io_llist_xchg(&ctx->work_llist, NULL)); while (node) { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); prefetch(container_of(next, struct io_kiocb, io_task_work.node)); - req->io_task_work.func(req, ts); + INDIRECT_CALL_2(req->io_task_work.func, + io_poll_task_func, io_req_rw_complete, + req, ts); ret++; node = next; } @@ -1498,9 +1497,6 @@ void io_queue_next(struct io_kiocb *req) void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) __must_hold(&ctx->uring_lock) { - struct task_struct *task = NULL; - int task_refs = 0; - do { struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); @@ -1530,19 +1526,10 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) io_req_put_rsrc_locked(req, ctx); - if (req->task != task) { - if (task) - io_put_task(task, task_refs); - task = req->task; - task_refs = 0; - } - task_refs++; + io_put_task(req->task); node = req->comp_list.next; io_req_add_to_cache(req, ctx); } while (node); - - if (task) - io_put_task(task, task_refs); } static void __io_submit_flush_completions(struct io_ring_ctx *ctx) @@ -1570,7 +1557,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) } } } - __io_cq_unlock_post_flush(ctx); + __io_cq_unlock_post(ctx); if (!wq_list_empty(&ctx->submit_state.compl_reqs)) { io_free_batch_list(ctx, state->compl_reqs.first); @@ -1578,22 +1565,6 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) } } -/* - * Drop reference to request, return next in chain (if there is one) if this - * was the last reference to this request. - */ -static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) -{ - struct io_kiocb *nxt = NULL; - - if (req_ref_put_and_test(req)) { - if (unlikely(req->flags & IO_REQ_LINK_FLAGS)) - nxt = io_req_find_next(req); - io_free_req(req); - } - return nxt; -} - static unsigned io_cqring_events(struct io_ring_ctx *ctx) { /* See comment at the top of this file */ @@ -1758,54 +1729,14 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) } } -static bool io_bdev_nowait(struct block_device *bdev) -{ - return !bdev || bdev_nowait(bdev); -} - -/* - * If we tracked the file through the SCM inflight mechanism, we could support - * any file. For now, just ensure that anything potentially problematic is done - * inline. 
- */ -static bool __io_file_supports_nowait(struct file *file, umode_t mode) -{ - if (S_ISBLK(mode)) { - if (IS_ENABLED(CONFIG_BLOCK) && - io_bdev_nowait(I_BDEV(file->f_mapping->host))) - return true; - return false; - } - if (S_ISSOCK(mode)) - return true; - if (S_ISREG(mode)) { - if (IS_ENABLED(CONFIG_BLOCK) && - io_bdev_nowait(file->f_inode->i_sb->s_bdev) && - !io_is_uring_fops(file)) - return true; - return false; - } - - /* any ->read/write should understand O_NONBLOCK */ - if (file->f_flags & O_NONBLOCK) - return true; - return file->f_mode & FMODE_NOWAIT; -} - -/* - * If we tracked the file through the SCM inflight mechanism, we could support - * any file. For now, just ensure that anything potentially problematic is done - * inline. - */ unsigned int io_file_get_flags(struct file *file) { - umode_t mode = file_inode(file)->i_mode; unsigned int res = 0; - if (S_ISREG(mode)) - res |= FFS_ISREG; - if (__io_file_supports_nowait(file, mode)) - res |= FFS_NOWAIT; + if (S_ISREG(file_inode(file)->i_mode)) + res |= REQ_F_ISREG; + if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT)) + res |= REQ_F_SUPPORT_NOWAIT; return res; } @@ -1891,39 +1822,6 @@ queue: spin_unlock(&ctx->completion_lock); } -static void io_clean_op(struct io_kiocb *req) -{ - if (req->flags & REQ_F_BUFFER_SELECTED) { - spin_lock(&req->ctx->completion_lock); - io_put_kbuf_comp(req); - spin_unlock(&req->ctx->completion_lock); - } - - if (req->flags & REQ_F_NEED_CLEANUP) { - const struct io_cold_def *def = &io_cold_defs[req->opcode]; - - if (def->cleanup) - def->cleanup(req); - } - if ((req->flags & REQ_F_POLLED) && req->apoll) { - kfree(req->apoll->double_poll); - kfree(req->apoll); - req->apoll = NULL; - } - if (req->flags & REQ_F_INFLIGHT) { - struct io_uring_task *tctx = req->task->io_uring; - - atomic_dec(&tctx->inflight_tracked); - } - if (req->flags & REQ_F_CREDS) - put_cred(req->creds); - if (req->flags & REQ_F_ASYNC_DATA) { - kfree(req->async_data); - req->async_data = NULL; - } - req->flags &= ~IO_REQ_CLEAN_FLAGS; -} - static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, unsigned int issue_flags) { @@ -1986,9 +1884,14 @@ int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts) struct io_wq_work *io_wq_free_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; - req = io_put_req_find_next(req); - return req ? &req->work : NULL; + if (req_ref_put_and_test(req)) { + if (req->flags & IO_REQ_LINK_FLAGS) + nxt = io_req_find_next(req); + io_free_req(req); + } + return nxt ? 
&nxt->work : NULL; } void io_wq_submit_work(struct io_wq_work *work) @@ -2060,19 +1963,17 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; + struct io_fixed_file *slot; struct file *file = NULL; - unsigned long file_ptr; io_ring_submit_lock(ctx, issue_flags); if (unlikely((unsigned int)fd >= ctx->nr_user_files)) goto out; fd = array_index_nospec(fd, ctx->nr_user_files); - file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; - file = (struct file *) (file_ptr & FFS_MASK); - file_ptr &= ~FFS_MASK; - /* mask in overlapping REQ_F and FFS bits */ - req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT); + slot = io_fixed_file_slot(&ctx->file_table, fd); + file = io_slot_file(slot); + req->flags |= io_slot_flags(slot); io_req_set_rsrc_node(req, ctx, 0); out: io_ring_submit_unlock(ctx, issue_flags); @@ -2709,11 +2610,96 @@ static void io_mem_free(void *ptr) free_compound_page(page); } +static void io_pages_free(struct page ***pages, int npages) +{ + struct page **page_array; + int i; + + if (!pages) + return; + page_array = *pages; + for (i = 0; i < npages; i++) + unpin_user_page(page_array[i]); + kvfree(page_array); + *pages = NULL; +} + +static void *__io_uaddr_map(struct page ***pages, unsigned short *npages, + unsigned long uaddr, size_t size) +{ + struct page **page_array; + unsigned int nr_pages; + int ret; + + *npages = 0; + + if (uaddr & (PAGE_SIZE - 1) || !size) + return ERR_PTR(-EINVAL); + + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (nr_pages > USHRT_MAX) + return ERR_PTR(-EINVAL); + page_array = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!page_array) + return ERR_PTR(-ENOMEM); + + ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, + page_array); + if (ret != nr_pages) { +err: + io_pages_free(&page_array, ret > 0 ? ret : 0); + return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT); + } + /* + * Should be a single page. If the ring is small enough that we can + * use a normal page, that is fine. If we need multiple pages, then + * userspace should use a huge page. That's the only way to guarantee + * that we get contigious memory, outside o |