author		Linus Torvalds <torvalds@linux-foundation.org>	2023-06-26 12:30:26 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2023-06-26 12:30:26 -0700
commit		0aa69d53ac7c30f6184f88f2e310d808b32b35a5 (patch)
tree		26a666a128578dd1c46f39b432543411caa0d6eb
parent		3eccc0c886b1796f95a289c9d127c8ca1a254bd5 (diff)
parent		c98c81a4ac37b651be7eb9d16f562fc4acc5f867 (diff)
Merge tag 'for-6.5/io_uring-2023-06-23' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:
 "Nothing major in this release, just a bunch of cleanups and some
  optimizations, mostly around networking.

   - Clean up file request flags handling (Christoph)

   - Clean up request freeing and CQ locking (Pavel)

   - Support pre-registering the io_uring fd at setup time (Josh)

   - Add support for user-allocated ring memory, rather than having the
     kernel allocate it. Mostly for packing rings into a huge page (me)

   - Avoid an unnecessary double retry on receive (me)

   - Maintain ordering for task_work, which also improves performance (me)

   - Misc cleanups/fixes (Pavel, me)"

* tag 'for-6.5/io_uring-2023-06-23' of git://git.kernel.dk/linux: (39 commits)
  io_uring: merge conditional unlock flush helpers
  io_uring: make io_cq_unlock_post static
  io_uring: inline __io_cq_unlock
  io_uring: fix acquire/release annotations
  io_uring: kill io_cq_unlock()
  io_uring: remove IOU_F_TWQ_FORCE_NORMAL
  io_uring: don't batch task put on reqs free
  io_uring: move io_clean_op()
  io_uring: inline io_dismantle_req()
  io_uring: remove io_free_req_tw
  io_uring: open code io_put_req_find_next
  io_uring: add helpers to decode the fixed file file_ptr
  io_uring: use io_file_from_index in io_msg_grab_file
  io_uring: use io_file_from_index in __io_sync_cancel
  io_uring: return REQ_F_ flags from io_file_get_flags
  io_uring: remove io_req_ffs_set
  io_uring: remove a confusing comment above io_file_get_flags
  io_uring: remove the mode variable in io_file_get_flags
  io_uring: remove __io_file_supports_nowait
  io_uring: wait interruptibly for request completions on exit
  ...
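The headline addition, user-allocated ring memory, is driven from userspace roughly as below. This is a hedged sketch rather than anything from the pull itself: the 2 MB huge page and the 1 MB split between the ring region and the SQE array are illustrative assumptions, and a real application must size the two regions to match what the kernel computes for the requested entry count.

/* Hedged sketch: io_uring setup with application-provided ring memory
 * (IORING_SETUP_NO_MMAP, new in this pull). Requires 6.5+ uapi headers;
 * region sizes below are illustrative assumptions. */
#include <linux/io_uring.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static int setup_ring_no_mmap(unsigned int entries)
{
	struct io_uring_params p;
	/* One 2 MB huge page backing both regions; the kernel pins these
	 * pages rather than allocating its own. */
	void *mem = mmap(NULL, 2 * 1024 * 1024, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (mem == MAP_FAILED)
		return -1;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_NO_MMAP;
	/* SQ/CQ rings live at cq_off.user_addr, the SQE array at
	 * sq_off.user_addr; both must be page aligned. The 1 MB split
	 * is an assumption for the sketch. */
	p.cq_off.user_addr = (uint64_t)(uintptr_t)mem;
	p.sq_off.user_addr = (uint64_t)(uintptr_t)mem + (1024 * 1024);

	return syscall(__NR_io_uring_setup, entries, &p);
}

Packing both regions into a single huge page is the use case the message cites: it keeps the pinned memory physically contiguous without involving the kernel's page allocator.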
-rw-r--r--  block/fops.c                       5
-rw-r--r--  drivers/nvme/host/ioctl.c          4
-rw-r--r--  include/linux/io_uring.h          18
-rw-r--r--  include/linux/io_uring_types.h    10
-rw-r--r--  include/uapi/linux/io_uring.h     16
-rw-r--r--  io_uring/cancel.c                  5
-rw-r--r--  io_uring/filetable.c              11
-rw-r--r--  io_uring/filetable.h              28
-rw-r--r--  io_uring/io_uring.c              497
-rw-r--r--  io_uring/io_uring.h               17
-rw-r--r--  io_uring/msg_ring.c                4
-rw-r--r--  io_uring/net.c                    58
-rw-r--r--  io_uring/poll.c                    6
-rw-r--r--  io_uring/poll.h                    2
-rw-r--r--  io_uring/rsrc.c                    8
-rw-r--r--  io_uring/rw.c                      6
-rw-r--r--  io_uring/rw.h                      1
-rw-r--r--  io_uring/tctx.c                   31
-rw-r--r--  io_uring/timeout.c                 6
-rw-r--r--  io_uring/uring_cmd.c              16
-rw-r--r--  net/socket.c                       1
21 files changed, 416 insertions, 334 deletions
diff --git a/block/fops.c b/block/fops.c
index 18dedb730493..4faeada05b23 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -481,7 +481,7 @@ static int blkdev_open(struct inode *inode, struct file *filp)
* during an unstable branch.
*/
filp->f_flags |= O_LARGEFILE;
- filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
+ filp->f_mode |= FMODE_BUF_RASYNC;
if (filp->f_flags & O_NDELAY)
filp->f_mode |= FMODE_NDELAY;
@@ -494,6 +494,9 @@ static int blkdev_open(struct inode *inode, struct file *filp)
if (IS_ERR(bdev))
return PTR_ERR(bdev);
+ if (bdev_nowait(bdev))
+ filp->f_mode |= FMODE_NOWAIT;
+
filp->private_data = bdev;
filp->f_mapping = bdev->bd_inode->i_mapping;
filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index f15e7330b75a..b97744e05551 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -521,7 +521,7 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
if (cookie != NULL && blk_rq_is_poll(req))
nvme_uring_task_cb(ioucmd, IO_URING_F_UNLOCKED);
else
- io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb);
+ io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
return RQ_END_IO_FREE;
}
@@ -543,7 +543,7 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req,
if (cookie != NULL && blk_rq_is_poll(req))
nvme_uring_task_meta_cb(ioucmd, IO_URING_F_UNLOCKED);
else
- io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_meta_cb);
+ io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_meta_cb);
return RQ_END_IO_NONE;
}
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 7fe31b2cd02f..bb9c666bd584 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -46,13 +46,23 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
struct iov_iter *iter, void *ioucmd);
void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2,
unsigned issue_flags);
-void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
- void (*task_work_cb)(struct io_uring_cmd *, unsigned));
struct sock *io_uring_get_socket(struct file *file);
void __io_uring_cancel(bool cancel_all);
void __io_uring_free(struct task_struct *tsk);
void io_uring_unreg_ringfd(void);
const char *io_uring_get_opcode(u8 opcode);
+void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
+ void (*task_work_cb)(struct io_uring_cmd *, unsigned),
+ unsigned flags);
+/* users should follow semantics of IOU_F_TWQ_LAZY_WAKE */
+void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
+ void (*task_work_cb)(struct io_uring_cmd *, unsigned));
+
+static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
+ void (*task_work_cb)(struct io_uring_cmd *, unsigned))
+{
+ __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0);
+}
static inline void io_uring_files_cancel(void)
{
@@ -85,6 +95,10 @@ static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned))
{
}
+static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
+ void (*task_work_cb)(struct io_uring_cmd *, unsigned))
+{
+}
static inline struct sock *io_uring_get_socket(struct file *file)
{
return NULL;
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 1b2a20a42413..f04ce513fadb 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -211,6 +211,16 @@ struct io_ring_ctx {
unsigned int compat: 1;
enum task_work_notify_mode notify_method;
+
+ /*
+ * If IORING_SETUP_NO_MMAP is used, then the below holds
+ * the gup'ed pages for the two rings, and the sqes.
+ */
+ unsigned short n_ring_pages;
+ unsigned short n_sqe_pages;
+ struct page **ring_pages;
+ struct page **sqe_pages;
+
struct io_rings *rings;
struct task_struct *submitter_task;
struct percpu_ref refs;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 0716cb17e436..f222d263bc55 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -173,6 +173,18 @@ enum {
*/
#define IORING_SETUP_DEFER_TASKRUN (1U << 13)
+/*
+ * Application provides the memory for the rings
+ */
+#define IORING_SETUP_NO_MMAP (1U << 14)
+
+/*
+ * Register the ring fd in itself for use with
+ * IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather
+ * than an fd.
+ */
+#define IORING_SETUP_REGISTERED_FD_ONLY (1U << 15)
+
enum io_uring_op {
IORING_OP_NOP,
IORING_OP_READV,
@@ -406,7 +418,7 @@ struct io_sqring_offsets {
__u32 dropped;
__u32 array;
__u32 resv1;
- __u64 resv2;
+ __u64 user_addr;
};
/*
@@ -425,7 +437,7 @@ struct io_cqring_offsets {
__u32 cqes;
__u32 flags;
__u32 resv1;
- __u64 resv2;
+ __u64 user_addr;
};
/*
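A hedged sketch of driving a ring created with the second new flag: setup returns a registered index rather than a normal file descriptor, so every subsequent io_uring_enter() must pass the pre-existing IORING_ENTER_REGISTERED_RING flag with that index. In this series the flag appears to require IORING_SETUP_NO_MMAP as well, since there is no fd through which the rings could be mmap'ed.

/* Hedged sketch: entering a ring whose io_uring_setup() returned a
 * registered index (IORING_SETUP_REGISTERED_FD_ONLY), not an fd. */
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

static int submit_and_wait_one(int ring_index, unsigned int to_submit)
{
	/*
	 * ring_index is the value io_uring_setup() returned; it is not a
	 * real fd and cannot be passed to close() or mmap(). The
	 * IORING_ENTER_REGISTERED_RING flag tells the kernel to resolve
	 * it through the task's registered-ring table instead.
	 */
	return syscall(__NR_io_uring_enter, ring_index, to_submit, 1,
		       IORING_ENTER_GETEVENTS | IORING_ENTER_REGISTERED_RING,
		       NULL, 0);
}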
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index b4f5dfacc0c3..58c46c852bdd 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -216,13 +216,10 @@ static int __io_sync_cancel(struct io_uring_task *tctx,
/* fixed must be grabbed every time since we drop the uring_lock */
if ((cd->flags & IORING_ASYNC_CANCEL_FD) &&
(cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
- unsigned long file_ptr;
-
if (unlikely(fd >= ctx->nr_user_files))
return -EBADF;
fd = array_index_nospec(fd, ctx->nr_user_files);
- file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
- cd->file = (struct file *) (file_ptr & FFS_MASK);
+ cd->file = io_file_from_index(&ctx->file_table, fd);
if (!cd->file)
return -EBADF;
}
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index 0f6fa791a47d..e7d749991de4 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -78,10 +78,8 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
if (file_slot->file_ptr) {
- struct file *old_file;
-
- old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
- ret = io_queue_rsrc_removal(ctx->file_data, slot_index, old_file);
+ ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
+ io_slot_file(file_slot));
if (ret)
return ret;
@@ -140,7 +138,6 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
{
struct io_fixed_file *file_slot;
- struct file *file;
int ret;
if (unlikely(!ctx->file_data))
@@ -153,8 +150,8 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
if (!file_slot->file_ptr)
return -EBADF;
- file = (struct file *)(file_slot->file_ptr & FFS_MASK);
- ret = io_queue_rsrc_removal(ctx->file_data, offset, file);
+ ret = io_queue_rsrc_removal(ctx->file_data, offset,
+ io_slot_file(file_slot));
if (ret)
return ret;
diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index 351111ff8882..b47adf170c31 100644
--- a/io_uring/filetable.h
+++ b/io_uring/filetable.h
@@ -5,10 +5,6 @@
#include <linux/file.h>
#include <linux/io_uring_types.h>
-#define FFS_NOWAIT 0x1UL
-#define FFS_ISREG 0x2UL
-#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG)
-
bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files);
void io_free_file_tables(struct io_file_table *table);
@@ -43,21 +39,31 @@ io_fixed_file_slot(struct io_file_table *table, unsigned i)
return &table->files[i];
}
+#define FFS_NOWAIT 0x1UL
+#define FFS_ISREG 0x2UL
+#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG)
+
+static inline unsigned int io_slot_flags(struct io_fixed_file *slot)
+{
+ return (slot->file_ptr & ~FFS_MASK) << REQ_F_SUPPORT_NOWAIT_BIT;
+}
+
+static inline struct file *io_slot_file(struct io_fixed_file *slot)
+{
+ return (struct file *)(slot->file_ptr & FFS_MASK);
+}
+
static inline struct file *io_file_from_index(struct io_file_table *table,
int index)
{
- struct io_fixed_file *slot = io_fixed_file_slot(table, index);
-
- return (struct file *) (slot->file_ptr & FFS_MASK);
+ return io_slot_file(io_fixed_file_slot(table, index));
}
static inline void io_fixed_file_set(struct io_fixed_file *file_slot,
struct file *file)
{
- unsigned long file_ptr = (unsigned long) file;
-
- file_ptr |= io_file_get_flags(file);
- file_slot->file_ptr = file_ptr;
+ file_slot->file_ptr = (unsigned long)file |
+ (io_file_get_flags(file) >> REQ_F_SUPPORT_NOWAIT_BIT);
}
static inline void io_reset_alloc_hint(struct io_ring_ctx *ctx)
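The helpers above lean on struct file allocations being at least 4-byte aligned, which leaves the two low pointer bits free for FFS_NOWAIT and FFS_ISREG; the shift by REQ_F_SUPPORT_NOWAIT_BIT works because the matching REQ_F_ flags occupy the same relative bit positions. A standalone model of the packing, with hypothetical names outside the kernel:

/* Illustrative model of the tagging behind io_slot_file() and
 * io_slot_flags(); names here are hypothetical, not kernel API. */
#include <stdint.h>

#define TAG_NOWAIT 0x1UL
#define TAG_ISREG  0x2UL
#define TAG_MASK   (TAG_NOWAIT | TAG_ISREG)

struct file;	/* opaque; allocations are at least 4-byte aligned */

static inline uintptr_t pack_slot(struct file *f, unsigned long tags)
{
	/* An aligned pointer's low two bits are zero, so tags fit there. */
	return (uintptr_t)f | (tags & TAG_MASK);
}

static inline struct file *unpack_file(uintptr_t slot)
{
	return (struct file *)(slot & ~TAG_MASK);
}

static inline unsigned long unpack_tags(uintptr_t slot)
{
	return slot & TAG_MASK;
}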
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 3bca7a79efda..1b53a2ab0a27 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -95,6 +95,7 @@
#include "timeout.h"
#include "poll.h"
+#include "rw.h"
#include "alloc_cache.h"
#define IORING_MAX_ENTRIES 32768
@@ -145,8 +146,6 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
bool cancel_all);
-static void io_dismantle_req(struct io_kiocb *req);
-static void io_clean_op(struct io_kiocb *req);
static void io_queue_sqe(struct io_kiocb *req);
static void io_move_task_work_from_local(struct io_ring_ctx *ctx);
static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
@@ -367,6 +366,39 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
return false;
}
+static void io_clean_op(struct io_kiocb *req)
+{
+ if (req->flags & REQ_F_BUFFER_SELECTED) {
+ spin_lock(&req->ctx->completion_lock);
+ io_put_kbuf_comp(req);
+ spin_unlock(&req->ctx->completion_lock);
+ }
+
+ if (req->flags & REQ_F_NEED_CLEANUP) {
+ const struct io_cold_def *def = &io_cold_defs[req->opcode];
+
+ if (def->cleanup)
+ def->cleanup(req);
+ }
+ if ((req->flags & REQ_F_POLLED) && req->apoll) {
+ kfree(req->apoll->double_poll);
+ kfree(req->apoll);
+ req->apoll = NULL;
+ }
+ if (req->flags & REQ_F_INFLIGHT) {
+ struct io_uring_task *tctx = req->task->io_uring;
+
+ atomic_dec(&tctx->inflight_tracked);
+ }
+ if (req->flags & REQ_F_CREDS)
+ put_cred(req->creds);
+ if (req->flags & REQ_F_ASYNC_DATA) {
+ kfree(req->async_data);
+ req->async_data = NULL;
+ }
+ req->flags &= ~IO_REQ_CLEAN_FLAGS;
+}
+
static inline void io_req_track_inflight(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_INFLIGHT)) {
@@ -423,8 +455,8 @@ static void io_prep_async_work(struct io_kiocb *req)
if (req->flags & REQ_F_FORCE_ASYNC)
req->work.flags |= IO_WQ_WORK_CONCURRENT;
- if (req->file && !io_req_ffs_set(req))
- req->flags |= io_file_get_flags(req->file) << REQ_F_SUPPORT_NOWAIT_BIT;
+ if (req->file && !(req->flags & REQ_F_FIXED_FILE))
+ req->flags |= io_file_get_flags(req->file);
if (req->file && (req->flags & REQ_F_ISREG)) {
bool should_hash = def->hash_reg_file;
@@ -594,42 +626,18 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
}
static inline void __io_cq_lock(struct io_ring_ctx *ctx)
- __acquires(ctx->completion_lock)
{
if (!ctx->task_complete)
spin_lock(&ctx->completion_lock);
}
-static inline void __io_cq_unlock(struct io_ring_ctx *ctx)
-{
- if (!ctx->task_complete)
- spin_unlock(&ctx->completion_lock);
-}
-
static inline void io_cq_lock(struct io_ring_ctx *ctx)
__acquires(ctx->completion_lock)
{
spin_lock(&ctx->completion_lock);
}
-static inline void io_cq_unlock(struct io_ring_ctx *ctx)
- __releases(ctx->completion_lock)
-{
- spin_unlock(&ctx->completion_lock);
-}
-
-/* keep it inlined for io_submit_flush_completions() */
static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
- __releases(ctx->completion_lock)
-{
- io_commit_cqring(ctx);
- __io_cq_unlock(ctx);
- io_commit_cqring_flush(ctx);
- io_cqring_wake(ctx);
-}
-
-static void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx)
- __releases(ctx->completion_lock)
{
io_commit_cqring(ctx);
@@ -641,13 +649,13 @@ static void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx)
*/
io_commit_cqring_flush(ctx);
} else {
- __io_cq_unlock(ctx);
+ spin_unlock(&ctx->completion_lock);
io_commit_cqring_flush(ctx);
io_cqring_wake(ctx);
}
}
-void io_cq_unlock_post(struct io_ring_ctx *ctx)
+static void io_cq_unlock_post(struct io_ring_ctx *ctx)
__releases(ctx->completion_lock)
{
io_commit_cqring(ctx);
@@ -662,10 +670,10 @@ static void io_cqring_overflow_kill(struct io_ring_ctx *ctx)
struct io_overflow_cqe *ocqe;
LIST_HEAD(list);
- io_cq_lock(ctx);
+ spin_lock(&ctx->completion_lock);
list_splice_init(&ctx->cq_overflow_list, &list);
clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
- io_cq_unlock(ctx);
+ spin_unlock(&ctx->completion_lock);
while (!list_empty(&list)) {
ocqe = list_first_entry(&list, struct io_overflow_cqe, list);
@@ -722,29 +730,29 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx)
}
/* can be called by any task */
-static void io_put_task_remote(struct task_struct *task, int nr)
+static void io_put_task_remote(struct task_struct *task)
{
struct io_uring_task *tctx = task->io_uring;
- percpu_counter_sub(&tctx->inflight, nr);
+ percpu_counter_sub(&tctx->inflight, 1);
if (unlikely(atomic_read(&tctx->in_cancel)))
wake_up(&tctx->wait);
- put_task_struct_many(task, nr);
+ put_task_struct(task);
}
/* used by a task to put its own references */
-static void io_put_task_local(struct task_struct *task, int nr)
+static void io_put_task_local(struct task_struct *task)
{
- task->io_uring->cached_refs += nr;
+ task->io_uring->cached_refs++;
}
/* must be called somewhat shortly after putting a request */
-static inline void io_put_task(struct task_struct *task, int nr)
+static inline void io_put_task(struct task_struct *task)
{
if (likely(task == current))
- io_put_task_local(task, nr);
+ io_put_task_local(task);
else
- io_put_task_remote(task, nr);
+ io_put_task_remote(task);
}
void io_task_refs_refill(struct io_uring_task *tctx)
@@ -934,20 +942,19 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags
return __io_post_aux_cqe(ctx, user_data, res, cflags, true);
}
-bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags,
+bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags,
bool allow_overflow)
{
+ struct io_ring_ctx *ctx = req->ctx;
+ u64 user_data = req->cqe.user_data;
struct io_uring_cqe *cqe;
- unsigned int length;
if (!defer)
return __io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow);
- length = ARRAY_SIZE(ctx->submit_state.cqes);
-
lockdep_assert_held(&ctx->uring_lock);
- if (ctx->submit_state.cqes_count == length) {
+ if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->submit_state.cqes)) {
__io_cq_lock(ctx);
__io_flush_post_cqes(ctx);
/* no need to flush - flush is deferred */
@@ -991,14 +998,18 @@ static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
}
}
io_put_kbuf_comp(req);
- io_dismantle_req(req);
+ if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
+ io_clean_op(req);
+ if (!(req->flags & REQ_F_FIXED_FILE))
+ io_put_file(req->file);
+
rsrc_node = req->rsrc_node;
/*
* Selected buffer deallocation in io_clean_op() assumes that
* we don't hold ->completion_lock. Clean them here to avoid
* deadlocks.
*/
- io_put_task_remote(req->task, 1);
+ io_put_task_remote(req->task);
wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
ctx->locked_free_nr++;
}
@@ -1111,36 +1122,13 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
return true;
}
-static inline void io_dismantle_req(struct io_kiocb *req)
-{
- unsigned int flags = req->flags;
-
- if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
- io_clean_op(req);
- if (!(flags & REQ_F_FIXED_FILE))
- io_put_file(req->file);
-}
-
-static __cold void io_free_req_tw(struct io_kiocb *req, struct io_tw_state *ts)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- if (req->rsrc_node) {
- io_tw_lock(ctx, ts);
- io_put_rsrc_node(ctx, req->rsrc_node);
- }
- io_dismantle_req(req);
- io_put_task_remote(req->task, 1);
-
- spin_lock(&ctx->completion_lock);
- wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
- ctx->locked_free_nr++;
- spin_unlock(&ctx->completion_lock);
-}
-
__cold void io_free_req(struct io_kiocb *req)
{
- req->io_task_work.func = io_free_req_tw;
+ /* refs were already put, restore them for io_req_task_complete() */
+ req->flags &= ~REQ_F_REFCOUNT;
+ /* we only want to free it, don't post CQEs */
+ req->flags |= REQ_F_CQE_SKIP;
+ req->io_task_work.func = io_req_task_complete;
io_req_task_work_add(req);
}
@@ -1205,7 +1193,9 @@ static unsigned int handle_tw_list(struct llist_node *node,
ts->locked = mutex_trylock(&(*ctx)->uring_lock);
percpu_ref_get(&(*ctx)->refs);
}
- req->io_task_work.func(req, ts);
+ INDIRECT_CALL_2(req->io_task_work.func,
+ io_poll_task_func, io_req_rw_complete,
+ req, ts);
node = next;
count++;
if (unlikely(need_resched())) {
@@ -1303,7 +1293,7 @@ static __cold void io_fallback_tw(struct io_uring_task *tctx)
}
}
-static void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
+static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
{
struct io_ring_ctx *ctx = req->ctx;
unsigned nr_wait, nr_tw, nr_tw_prev;
@@ -1354,19 +1344,11 @@ static void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
}
-void __io_req_task_work_add(struct io_kiocb *req, unsigned flags)
+static void io_req_normal_work_add(struct io_kiocb *req)
{
struct io_uring_task *tctx = req->task->io_uring;
struct io_ring_ctx *ctx = req->ctx;
- if (!(flags & IOU_F_TWQ_FORCE_NORMAL) &&
- (ctx->flags & IORING_SETUP_DEFER_TASKRUN)) {
- rcu_read_lock();
- io_req_local_work_add(req, flags);
- rcu_read_unlock();
- return;
- }
-
/* task_work already pending, we're done */
if (!llist_add(&req->io_task_work.node, &tctx->task_list))
return;
@@ -1380,6 +1362,17 @@ void __io_req_task_work_add(struct io_kiocb *req, unsigned flags)
io_fallback_tw(tctx);
}
+void __io_req_task_work_add(struct io_kiocb *req, unsigned flags)
+{
+ if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
+ rcu_read_lock();
+ io_req_local_work_add(req, flags);
+ rcu_read_unlock();
+ } else {
+ io_req_normal_work_add(req);
+ }
+}
+
static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
{
struct llist_node *node;
@@ -1390,7 +1383,7 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
io_task_work.node);
node = node->next;
- __io_req_task_work_add(req, IOU_F_TWQ_FORCE_NORMAL);
+ io_req_normal_work_add(req);
}
}
@@ -1405,13 +1398,19 @@ static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts)
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
again:
- node = io_llist_xchg(&ctx->work_llist, NULL);
+ /*
+ * llists are in reverse order, flip it back the right way before
+ * running the pending items.
+ */
+ node = llist_reverse_order(io_llist_xchg(&ctx->work_llist, NULL));
while (node) {
struct llist_node *next = node->next;
struct io_kiocb *req = container_of(node, struct io_kiocb,
io_task_work.node);
prefetch(container_of(next, struct io_kiocb, io_task_work.node));
- req->io_task_work.func(req, ts);
+ INDIRECT_CALL_2(req->io_task_work.func,
+ io_poll_task_func, io_req_rw_complete,
+ req, ts);
ret++;
node = next;
}
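For context on the new llist_reverse_order() call: llist_add() is a lock-free push onto the head of the list, so a consumer that exchanges the list out walks entries newest-first; reversing restores submission order, matching the "maintain ordering for task_work" item in the merge message. A minimal userspace model of the reversal:

/* Userspace model of llist_reverse_order(): a lock-free push builds a
 * LIFO chain; reversing it restores submission (FIFO) order. */
struct node {
	struct node *next;
};

static struct node *reverse_order(struct node *head)
{
	struct node *new_head = NULL;

	while (head) {
		struct node *next = head->next;

		head->next = new_head;	/* re-point each node at its predecessor */
		new_head = head;
		head = next;
	}
	return new_head;
}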
@@ -1498,9 +1497,6 @@ void io_queue_next(struct io_kiocb *req)
void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node)
__must_hold(&ctx->uring_lock)
{
- struct task_struct *task = NULL;
- int task_refs = 0;
-
do {
struct io_kiocb *req = container_of(node, struct io_kiocb,
comp_list);
@@ -1530,19 +1526,10 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node)
io_req_put_rsrc_locked(req, ctx);
- if (req->task != task) {
- if (task)
- io_put_task(task, task_refs);
- task = req->task;
- task_refs = 0;
- }
- task_refs++;
+ io_put_task(req->task);
node = req->comp_list.next;
io_req_add_to_cache(req, ctx);
} while (node);
-
- if (task)
- io_put_task(task, task_refs);
}
static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
@@ -1570,7 +1557,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
}
}
}
- __io_cq_unlock_post_flush(ctx);
+ __io_cq_unlock_post(ctx);
if (!wq_list_empty(&ctx->submit_state.compl_reqs)) {
io_free_batch_list(ctx, state->compl_reqs.first);
@@ -1578,22 +1565,6 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
}
}
-/*
- * Drop reference to request, return next in chain (if there is one) if this
- * was the last reference to this request.
- */
-static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
-{
- struct io_kiocb *nxt = NULL;
-
- if (req_ref_put_and_test(req)) {
- if (unlikely(req->flags & IO_REQ_LINK_FLAGS))
- nxt = io_req_find_next(req);
- io_free_req(req);
- }
- return nxt;
-}
-
static unsigned io_cqring_events(struct io_ring_ctx *ctx)
{
/* See comment at the top of this file */
@@ -1758,54 +1729,14 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
}
}
-static bool io_bdev_nowait(struct block_device *bdev)
-{
- return !bdev || bdev_nowait(bdev);
-}
-
-/*
- * If we tracked the file through the SCM inflight mechanism, we could support
- * any file. For now, just ensure that anything potentially problematic is done
- * inline.
- */
-static bool __io_file_supports_nowait(struct file *file, umode_t mode)
-{
- if (S_ISBLK(mode)) {
- if (IS_ENABLED(CONFIG_BLOCK) &&
- io_bdev_nowait(I_BDEV(file->f_mapping->host)))
- return true;
- return false;
- }
- if (S_ISSOCK(mode))
- return true;
- if (S_ISREG(mode)) {
- if (IS_ENABLED(CONFIG_BLOCK) &&
- io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
- !io_is_uring_fops(file))
- return true;
- return false;
- }
-
- /* any ->read/write should understand O_NONBLOCK */
- if (file->f_flags & O_NONBLOCK)
- return true;
- return file->f_mode & FMODE_NOWAIT;
-}
-
-/*
- * If we tracked the file through the SCM inflight mechanism, we could support
- * any file. For now, just ensure that anything potentially problematic is done
- * inline.
- */
unsigned int io_file_get_flags(struct file *file)
{
- umode_t mode = file_inode(file)->i_mode;
unsigned int res = 0;
- if (S_ISREG(mode))
- res |= FFS_ISREG;
- if (__io_file_supports_nowait(file, mode))
- res |= FFS_NOWAIT;
+ if (S_ISREG(file_inode(file)->i_mode))
+ res |= REQ_F_ISREG;
+ if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT))
+ res |= REQ_F_SUPPORT_NOWAIT;
return res;
}
@@ -1891,39 +1822,6 @@ queue:
spin_unlock(&ctx->completion_lock);
}
-static void io_clean_op(struct io_kiocb *req)
-{
- if (req->flags & REQ_F_BUFFER_SELECTED) {
- spin_lock(&req->ctx->completion_lock);
- io_put_kbuf_comp(req);
- spin_unlock(&req->ctx->completion_lock);
- }
-
- if (req->flags & REQ_F_NEED_CLEANUP) {
- const struct io_cold_def *def = &io_cold_defs[req->opcode];
-
- if (def->cleanup)
- def->cleanup(req);
- }
- if ((req->flags & REQ_F_POLLED) && req->apoll) {
- kfree(req->apoll->double_poll);
- kfree(req->apoll);
- req->apoll = NULL;
- }
- if (req->flags & REQ_F_INFLIGHT) {
- struct io_uring_task *tctx = req->task->io_uring;
-
- atomic_dec(&tctx->inflight_tracked);
- }
- if (req->flags & REQ_F_CREDS)
- put_cred(req->creds);
- if (req->flags & REQ_F_ASYNC_DATA) {
- kfree(req->async_data);
- req->async_data = NULL;
- }
- req->flags &= ~IO_REQ_CLEAN_FLAGS;
-}
-
static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
unsigned int issue_flags)
{
@@ -1986,9 +1884,14 @@ int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts)
struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ struct io_kiocb *nxt = NULL;
- req = io_put_req_find_next(req);
- return req ? &req->work : NULL;
+ if (req_ref_put_and_test(req)) {
+ if (req->flags & IO_REQ_LINK_FLAGS)
+ nxt = io_req_find_next(req);
+ io_free_req(req);
+ }
+ return nxt ? &nxt->work : NULL;
}
void io_wq_submit_work(struct io_wq_work *work)
@@ -2060,19 +1963,17 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
+ struct io_fixed_file *slot;
struct file *file = NULL;
- unsigned long file_ptr;
io_ring_submit_lock(ctx, issue_flags);
if (unlikely((unsigned int)fd >= ctx->nr_user_files))
goto out;
fd = array_index_nospec(fd, ctx->nr_user_files);
- file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
- file = (struct file *) (file_ptr & FFS_MASK);
- file_ptr &= ~FFS_MASK;
- /* mask in overlapping REQ_F and FFS bits */
- req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
+ slot = io_fixed_file_slot(&ctx->file_table, fd);
+ file = io_slot_file(slot);
+ req->flags |= io_slot_flags(slot);
io_req_set_rsrc_node(req, ctx, 0);
out:
io_ring_submit_unlock(ctx, issue_flags);
@@ -2709,11 +2610,96 @@ static void io_mem_free(void *ptr)
free_compound_page(page);
}
+static void io_pages_free(struct page ***pages, int npages)
+{
+ struct page **page_array;
+ int i;
+
+ if (!pages)
+ return;
+ page_array = *pages;
+ for (i = 0; i < npages; i++)
+ unpin_user_page(page_array[i]);
+ kvfree(page_array);
+ *pages = NULL;
+}
+
+static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
+ unsigned long uaddr, size_t size)
+{
+ struct page **page_array;
+ unsigned int nr_pages;
+ int ret;
+
+ *npages = 0;
+
+ if (uaddr & (PAGE_SIZE - 1) || !size)
+ return ERR_PTR(-EINVAL);
+
+ nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (nr_pages > USHRT_MAX)
+ return ERR_PTR(-EINVAL);
+ page_array = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
+ if (!page_array)
+ return ERR_PTR(-ENOMEM);
+
+ ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+ page_array);
+ if (ret != nr_pages) {
+err:
+ io_pages_free(&page_array, ret > 0 ? ret : 0);
+ return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT);
+ }
+ /*
+ * Should be a single page. If the ring is small enough that we can
+ * use a normal page, that is fine. If we need multiple pages, then
+ * userspace should use a huge page. That's the only way to guarantee
+ * that we get contiguous memory, outside o