summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--block/blk-core.c6
-rw-r--r--fs/block_dev.c2
-rw-r--r--fs/btrfs/file.c2
-rw-r--r--fs/io-wq.c14
-rw-r--r--fs/io-wq.h11
-rw-r--r--fs/io_uring.c2545
-rw-r--r--fs/xfs/xfs_file.c2
-rw-r--r--include/linux/blkdev.h1
-rw-r--r--include/linux/fs.h26
-rw-r--r--include/linux/pagemap.h43
-rw-r--r--include/linux/sched/task.h6
-rw-r--r--include/uapi/linux/io_uring.h4
-rw-r--r--mm/filemap.c91
-rw-r--r--tools/io_uring/liburing.h6
14 files changed, 1611 insertions, 1148 deletions
diff --git a/block/blk-core.c b/block/blk-core.c
index 93104c7470e8..d9d632639bd1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -960,9 +960,14 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
{
struct request_queue *q = bio->bi_disk->queue;
blk_status_t status = BLK_STS_IOERR;
+ struct blk_plug *plug;
might_sleep();
+ plug = blk_mq_plug(q, bio);
+ if (plug && plug->nowait)
+ bio->bi_opf |= REQ_NOWAIT;
+
/*
* For a REQ_NOWAIT based request, return -EOPNOTSUPP
* if queue is not a request based queue.
@@ -1802,6 +1807,7 @@ void blk_start_plug(struct blk_plug *plug)
INIT_LIST_HEAD(&plug->cb_list);
plug->rq_count = 0;
plug->multiple_queues = false;
+ plug->nowait = false;
/*
* Store ordering should not be needed here, since a potential
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 3f94a06a0946..8ae833e00443 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1734,7 +1734,7 @@ static int blkdev_open(struct inode * inode, struct file * filp)
*/
filp->f_flags |= O_LARGEFILE;
- filp->f_mode |= FMODE_NOWAIT;
+ filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
if (filp->f_flags & O_NDELAY)
filp->f_mode |= FMODE_NDELAY;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 841c516079a9..bb824c7cb7c7 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3537,7 +3537,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
static int btrfs_file_open(struct inode *inode, struct file *filp)
{
- filp->f_mode |= FMODE_NOWAIT;
+ filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
return generic_file_open(inode, filp);
}
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 47c5f3aeb460..e92c4724480c 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -462,6 +462,7 @@ static void io_impersonate_work(struct io_worker *worker,
io_wq_switch_mm(worker, work);
if (worker->cur_creds != work->creds)
io_wq_switch_creds(worker, work);
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->fsize;
}
static void io_assign_current_work(struct io_worker *worker,
@@ -489,7 +490,6 @@ static void io_worker_handle_work(struct io_worker *worker)
do {
struct io_wq_work *work;
- unsigned int hash;
get_next:
/*
* If we got some work, mark us as busy. If we didn't, but
@@ -512,6 +512,7 @@ get_next:
/* handle a whole dependent link */
do {
struct io_wq_work *old_work, *next_hashed, *linked;
+ unsigned int hash = io_get_work_hash(work);
next_hashed = wq_next_work(work);
io_impersonate_work(worker, work);
@@ -522,10 +523,8 @@ get_next:
if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
work->flags |= IO_WQ_WORK_CANCEL;
- hash = io_get_work_hash(work);
- linked = old_work = work;
- wq->do_work(&linked);
- linked = (old_work == linked) ? NULL : linked;
+ old_work = work;
+ linked = wq->do_work(work);
work = next_hashed;
if (!work && linked && !io_wq_is_hashed(linked)) {
@@ -542,8 +541,6 @@ get_next:
spin_lock_irq(&wqe->lock);
wqe->hash_map &= ~BIT_ULL(hash);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
- /* dependent work is not hashed */
- hash = -1U;
/* skip unnecessary unlock-lock wqe->lock */
if (!work)
goto get_next;
@@ -781,8 +778,7 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
struct io_wq_work *old_work = work;
work->flags |= IO_WQ_WORK_CANCEL;
- wq->do_work(&work);
- work = (work == old_work) ? NULL : work;
+ work = wq->do_work(work);
wq->free_work(old_work);
} while (work);
}
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 071f1a997800..ddaf9614cf9b 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -5,10 +5,10 @@ struct io_wq;
enum {
IO_WQ_WORK_CANCEL = 1,
- IO_WQ_WORK_HASHED = 4,
- IO_WQ_WORK_UNBOUND = 32,
- IO_WQ_WORK_NO_CANCEL = 256,
- IO_WQ_WORK_CONCURRENT = 512,
+ IO_WQ_WORK_HASHED = 2,
+ IO_WQ_WORK_UNBOUND = 4,
+ IO_WQ_WORK_NO_CANCEL = 8,
+ IO_WQ_WORK_CONCURRENT = 16,
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
};
@@ -89,6 +89,7 @@ struct io_wq_work {
struct mm_struct *mm;
const struct cred *creds;
struct fs_struct *fs;
+ unsigned long fsize;
unsigned flags;
};
@@ -101,7 +102,7 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
}
typedef void (free_work_fn)(struct io_wq_work *);
-typedef void (io_wq_work_fn)(struct io_wq_work **);
+typedef struct io_wq_work *(io_wq_work_fn)(struct io_wq_work *);
struct io_wq_data {
struct user_struct *user;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 493e5047e67c..2a3af95be4ca 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -78,6 +78,7 @@
#include <linux/fs_struct.h>
#include <linux/splice.h>
#include <linux/task_work.h>
+#include <linux/pagemap.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -226,7 +227,7 @@ struct io_ring_ctx {
struct {
unsigned int flags;
unsigned int compat: 1;
- unsigned int account_mem: 1;
+ unsigned int limit_mem: 1;
unsigned int cq_overflow_flushed: 1;
unsigned int drain_next: 1;
unsigned int eventfd_async: 1;
@@ -319,12 +320,12 @@ struct io_ring_ctx {
spinlock_t completion_lock;
/*
- * ->poll_list is protected by the ctx->uring_lock for
+ * ->iopoll_list is protected by the ctx->uring_lock for
* io_uring instances that don't use IORING_SETUP_SQPOLL.
* For SQPOLL, only the single threaded io_sq_thread() will
* manipulate the list, hence no extra locking is needed there.
*/
- struct list_head poll_list;
+ struct list_head iopoll_list;
struct hlist_head *cancel_hash;
unsigned cancel_hash_bits;
bool poll_multi_file;
@@ -395,6 +396,7 @@ struct io_timeout {
int flags;
u32 off;
u32 target_seq;
+ struct list_head list;
};
struct io_rw {
@@ -413,7 +415,7 @@ struct io_connect {
struct io_sr_msg {
struct file *file;
union {
- struct user_msghdr __user *msg;
+ struct user_msghdr __user *umsg;
void __user *buf;
};
int msg_flags;
@@ -486,6 +488,12 @@ struct io_statx {
struct statx __user *buffer;
};
+struct io_completion {
+ struct file *file;
+ struct list_head list;
+ int cflags;
+};
+
struct io_async_connect {
struct sockaddr_storage address;
};
@@ -503,6 +511,7 @@ struct io_async_rw {
struct iovec *iov;
ssize_t nr_segs;
ssize_t size;
+ struct wait_page_queue wpq;
};
struct io_async_ctx {
@@ -523,23 +532,18 @@ enum {
REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
REQ_F_LINK_HEAD_BIT,
- REQ_F_LINK_NEXT_BIT,
REQ_F_FAIL_LINK_BIT,
REQ_F_INFLIGHT_BIT,
REQ_F_CUR_POS_BIT,
REQ_F_NOWAIT_BIT,
REQ_F_LINK_TIMEOUT_BIT,
- REQ_F_TIMEOUT_BIT,
REQ_F_ISREG_BIT,
- REQ_F_MUST_PUNT_BIT,
- REQ_F_TIMEOUT_NOSEQ_BIT,
REQ_F_COMP_LOCKED_BIT,
REQ_F_NEED_CLEANUP_BIT,
REQ_F_OVERFLOW_BIT,
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
REQ_F_NO_FILE_TABLE_BIT,
- REQ_F_QUEUE_TIMEOUT_BIT,
REQ_F_WORK_INITIALIZED_BIT,
REQ_F_TASK_PINNED_BIT,
@@ -563,8 +567,6 @@ enum {
/* head of a link */
REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT),
- /* already grabbed next link */
- REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT),
/* fail rest of links */
REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
/* on inflight list */
@@ -575,14 +577,8 @@ enum {
REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
/* has linked timeout */
REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
- /* timeout request */
- REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT),
/* regular file */
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
- /* must be punted even for NONBLOCK */
- REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT),
- /* no timeout sequence */
- REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
/* completion under lock */
REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT),
/* needs cleanup */
@@ -595,8 +591,6 @@ enum {
REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
/* doesn't need file table for this request */
REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT),
- /* needs to queue linked timeout */
- REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT),
/* io_wq_work is initialized */
REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
/* req->task is refcounted */
@@ -606,7 +600,6 @@ enum {
struct async_poll {
struct io_poll_iocb poll;
struct io_poll_iocb *double_poll;
- struct io_wq_work work;
};
/*
@@ -635,51 +628,54 @@ struct io_kiocb {
struct io_splice splice;
struct io_provide_buf pbuf;
struct io_statx statx;
+ /* use only after cleaning per-op data, see io_clean_op() */
+ struct io_completion compl;
};
struct io_async_ctx *io;
- int cflags;
u8 opcode;
/* polled IO has completed */
u8 iopoll_completed;
u16 buf_index;
+ u32 result;
- struct io_ring_ctx *ctx;
- struct list_head list;
- unsigned int flags;
- refcount_t refs;
- struct task_struct *task;
- unsigned long fsize;
- u64 user_data;
- u32 result;
- u32 sequence;
-
- struct list_head link_list;
+ struct io_ring_ctx *ctx;
+ unsigned int flags;
+ refcount_t refs;
+ struct task_struct *task;
+ u64 user_data;
- struct list_head inflight_entry;
+ struct list_head link_list;
- struct percpu_ref *fixed_file_refs;
+ /*
+ * 1. used with ctx->iopoll_list with reads/writes
+ * 2. to track reqs with ->files (see io_op_def::file_table)
+ */
+ struct list_head inflight_entry;
+
+ struct percpu_ref *fixed_file_refs;
+ struct callback_head task_work;
+ /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
+ struct hlist_node hash_node;
+ struct async_poll *apoll;
+ struct io_wq_work work;
+};
- union {
- /*
- * Only commands that never go async can use the below fields,
- * obviously. Right now only IORING_OP_POLL_ADD uses them, and
- * async armed poll handlers for regular commands. The latter
- * restore the work, if needed.
- */
- struct {
- struct callback_head task_work;
- struct hlist_node hash_node;
- struct async_poll *apoll;
- };
- struct io_wq_work work;
- };
+struct io_defer_entry {
+ struct list_head list;
+ struct io_kiocb *req;
+ u32 seq;
};
-#define IO_PLUG_THRESHOLD 2
#define IO_IOPOLL_BATCH 8
+struct io_comp_state {
+ unsigned int nr;
+ struct list_head list;
+ struct io_ring_ctx *ctx;
+};
+
struct io_submit_state {
struct blk_plug plug;
@@ -690,12 +686,16 @@ struct io_submit_state {
unsigned int free_reqs;
/*
+ * Batch completion logic
+ */
+ struct io_comp_state comp;
+
+ /*
* File reference cache
*/
struct file *file;
unsigned int fd;
unsigned int has_refs;
- unsigned int used_refs;
unsigned int ios_left;
};
@@ -723,6 +723,7 @@ struct io_op_def {
unsigned pollout : 1;
/* op supports buffer selection */
unsigned buffer_select : 1;
+ unsigned needs_fsize : 1;
};
static const struct io_op_def io_op_defs[] = {
@@ -742,6 +743,7 @@ static const struct io_op_def io_op_defs[] = {
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .needs_fsize = 1,
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
@@ -756,6 +758,7 @@ static const struct io_op_def io_op_defs[] = {
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .needs_fsize = 1,
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
@@ -808,6 +811,7 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
+ .needs_fsize = 1,
},
[IORING_OP_OPENAT] = {
.file_table = 1,
@@ -839,6 +843,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .needs_fsize = 1,
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
@@ -881,22 +886,37 @@ static const struct io_op_def io_op_defs[] = {
},
};
-static void io_wq_submit_work(struct io_wq_work **workptr);
+enum io_mem_account {
+ ACCT_LOCKED,
+ ACCT_PINNED,
+};
+
+static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
+ struct io_comp_state *cs);
static void io_cqring_fill_event(struct io_kiocb *req, long res);
static void io_put_req(struct io_kiocb *req);
+static void io_double_put_req(struct io_kiocb *req);
static void __io_double_put_req(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_files_update *ip,
unsigned nr_args);
-static int io_grab_files(struct io_kiocb *req);
-static void io_complete_rw_common(struct kiocb *kiocb, long res);
-static void io_cleanup_req(struct io_kiocb *req);
+static int io_prep_work_files(struct io_kiocb *req);
+static void __io_clean_op(struct io_kiocb *req);
static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
int fd, struct file **out_file, bool fixed);
static void __io_queue_sqe(struct io_kiocb *req,
- const struct io_uring_sqe *sqe);
+ const struct io_uring_sqe *sqe,
+ struct io_comp_state *cs);
+static void io_file_put_work(struct work_struct *work);
+
+static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
+ struct iovec **iovec, struct iov_iter *iter,
+ bool needs_lock);
+static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
+ struct iovec *iovec, struct iovec *fast_iov,
+ struct iov_iter *iter);
static struct kmem_cache *req_cachep;
@@ -923,6 +943,12 @@ static void io_get_req_task(struct io_kiocb *req)
req->flags |= REQ_F_TASK_PINNED;
}
+static inline void io_clean_op(struct io_kiocb *req)
+{
+ if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED))
+ __io_clean_op(req);
+}
+
/* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */
static void __io_put_req_task(struct io_kiocb *req)
{
@@ -930,7 +956,41 @@ static void __io_put_req_task(struct io_kiocb *req)
put_task_struct(req->task);
}
-static void io_file_put_work(struct work_struct *work);
+static void io_sq_thread_drop_mm(void)
+{
+ struct mm_struct *mm = current->mm;
+
+ if (mm) {
+ kthread_unuse_mm(mm);
+ mmput(mm);
+ }
+}
+
+static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
+{
+ if (!current->mm) {
+ if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) ||
+ !mmget_not_zero(ctx->sqo_mm)))
+ return -EFAULT;
+ kthread_use_mm(ctx->sqo_mm);
+ }
+
+ return 0;
+}
+
+static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
+{
+ if (!io_op_defs[req->opcode].needs_mm)
+ return 0;
+ return __io_sq_thread_acquire_mm(ctx);
+}
+
+static inline void req_set_fail_links(struct io_kiocb *req)
+{
+ if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
+ req->flags |= REQ_F_FAIL_LINK;
+}
/*
* Note: must call io_req_init_async() for the first time you
@@ -957,6 +1017,11 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref)
complete(&ctx->ref_comp);
}
+static inline bool io_is_timeout_noseq(struct io_kiocb *req)
+{
+ return !req->timeout.off;
+}
+
static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
@@ -1000,7 +1065,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->wait);
spin_lock_init(&ctx->completion_lock);
- INIT_LIST_HEAD(&ctx->poll_list);
+ INIT_LIST_HEAD(&ctx->iopoll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
init_waitqueue_head(&ctx->inflight_wait);
@@ -1017,18 +1082,14 @@ err:
return NULL;
}
-static inline bool __req_need_defer(struct io_kiocb *req)
+static bool req_need_defer(struct io_kiocb *req, u32 seq)
{
- struct io_ring_ctx *ctx = req->ctx;
+ if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
+ struct io_ring_ctx *ctx = req->ctx;
- return req->sequence != ctx->cached_cq_tail
+ return seq != ctx->cached_cq_tail
+ atomic_read(&ctx->cached_cq_overflow);
-}
-
-static inline bool req_need_defer(struct io_kiocb *req)
-{
- if (unlikely(req->flags & REQ_F_IO_DRAIN))
- return __req_need_defer(req);
+ }
return false;
}
@@ -1046,28 +1107,7 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
}
}
-static inline void io_req_work_grab_env(struct io_kiocb *req,
- const struct io_op_def *def)
-{
- if (!req->work.mm && def->needs_mm) {
- mmgrab(current->mm);
- req->work.mm = current->mm;
- }
- if (!req->work.creds)
- req->work.creds = get_current_cred();
- if (!req->work.fs && def->needs_fs) {
- spin_lock(&current->fs->lock);
- if (!current->fs->in_exec) {
- req->work.fs = current->fs;
- req->work.fs->users++;
- } else {
- req->work.flags |= IO_WQ_WORK_CANCEL;
- }
- spin_unlock(&current->fs->lock);
- }
-}
-
-static inline void io_req_work_drop_env(struct io_kiocb *req)
+static void io_req_clean_work(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_WORK_INITIALIZED))
return;
@@ -1089,11 +1129,12 @@ static inline void io_req_work_drop_env(struct io_kiocb *req)
spin_unlock(&req->work.fs->lock);
if (fs)
free_fs_struct(fs);
+ req->work.fs = NULL;
}
+ req->flags &= ~REQ_F_WORK_INITIALIZED;
}
-static inline void io_prep_async_work(struct io_kiocb *req,
- struct io_kiocb **link)
+static void io_prep_async_work(struct io_kiocb *req)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
@@ -1106,18 +1147,42 @@ static inline void io_prep_async_work(struct io_kiocb *req,
if (def->unbound_nonreg_file)
req->work.flags |= IO_WQ_WORK_UNBOUND;
}
+ if (!req->work.mm && def->needs_mm) {
+ mmgrab(current->mm);
+ req->work.mm = current->mm;
+ }
+ if (!req->work.creds)
+ req->work.creds = get_current_cred();
+ if (!req->work.fs && def->needs_fs) {
+ spin_lock(&current->fs->lock);
+ if (!current->fs->in_exec) {
+ req->work.fs = current->fs;
+ req->work.fs->users++;
+ } else {
+ req->work.flags |= IO_WQ_WORK_CANCEL;
+ }
+ spin_unlock(&current->fs->lock);
+ }
+ if (def->needs_fsize)
+ req->work.fsize = rlimit(RLIMIT_FSIZE);
+ else
+ req->work.fsize = RLIM_INFINITY;
+}
- io_req_work_grab_env(req, def);
+static void io_prep_async_link(struct io_kiocb *req)
+{
+ struct io_kiocb *cur;
- *link = io_prep_linked_timeout(req);
+ io_prep_async_work(req);
+ if (req->flags & REQ_F_LINK_HEAD)
+ list_for_each_entry(cur, &req->link_list, link_list)
+ io_prep_async_work(cur);
}
-static inline void io_queue_async_work(struct io_kiocb *req)
+static void __io_queue_async_work(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- struct io_kiocb *link;
-
- io_prep_async_work(req, &link);
+ struct io_kiocb *link = io_prep_linked_timeout(req);
trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
&req->work, req->flags);
@@ -1127,14 +1192,22 @@ static inline void io_queue_async_work(struct io_kiocb *req)
io_queue_linked_timeout(link);
}
+static void io_queue_async_work(struct io_kiocb *req)
+{
+ /* init ->work of the whole link before punting */
+ io_prep_async_link(req);
+ __io_queue_async_work(req);
+}
+
static void io_kill_timeout(struct io_kiocb *req)
{
int ret;
ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
if (ret != -1) {
- atomic_inc(&req->ctx->cq_timeouts);
- list_del_init(&req->list);
+ atomic_set(&req->ctx->cq_timeouts,
+ atomic_read(&req->ctx->cq_timeouts) + 1);
+ list_del_init(&req->timeout.list);
req->flags |= REQ_F_COMP_LOCKED;
io_cqring_fill_event(req, 0);
io_put_req(req);
@@ -1146,7 +1219,7 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx)
struct io_kiocb *req, *tmp;
spin_lock_irq(&ctx->completion_lock);
- list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
+ list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list)
io_kill_timeout(req);
spin_unlock_irq(&ctx->completion_lock);
}
@@ -1154,13 +1227,15 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx)
static void __io_queue_deferred(struct io_ring_ctx *ctx)
{
do {
- struct io_kiocb *req = list_first_entry(&ctx->defer_list,
- struct io_kiocb, list);
+ struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
+ struct io_defer_entry, list);
- if (req_need_defer(req))
+ if (req_need_defer(de->req, de->seq))
break;
- list_del_init(&req->list);
- io_queue_async_work(req);
+ list_del_init(&de->list);
+ /* punt-init is done before queueing for defer */
+ __io_queue_async_work(de->req);
+ kfree(de);
} while (!list_empty(&ctx->defer_list));
}
@@ -1168,15 +1243,15 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
{
while (!list_empty(&ctx->timeout_list)) {
struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
- struct io_kiocb, list);
+ struct io_kiocb, timeout.list);
- if (req->flags & REQ_F_TIMEOUT_NOSEQ)
+ if (io_is_timeout_noseq(req))
break;
if (req->timeout.target_seq != ctx->cached_cq_tail
- atomic_read(&ctx->cq_timeouts))
break;
- list_del_init(&req->list);
+ list_del_init(&req->timeout.list);
io_kill_timeout(req);
}
}
@@ -1229,6 +1304,15 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
eventfd_signal(ctx->cq_ev_fd, 1);
}
+static void io_cqring_mark_overflow(struct io_ring_ctx *ctx)
+{
+ if (list_empty(&ctx->cq_overflow_list)) {
+ clear_bit(0, &ctx->sq_check_overflow);
+ clear_bit(0, &ctx->cq_check_overflow);
+ ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
+ }
+}
+
/* Returns true if there are no backlogged entries after the flush */
static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
@@ -1259,13 +1343,13 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
break;
req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
- list);
- list_move(&req->list, &list);
+ compl.list);
+ list_move(&req->compl.list, &list);
req->flags &= ~REQ_F_OVERFLOW;
if (cqe) {
WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, req->result);
- WRITE_ONCE(cqe->flags, req->cflags);
+ WRITE_ONCE(cqe->flags, req->compl.cflags);
} else {
WRITE_ONCE(ctx->rings->cq_overflow,
atomic_inc_return(&ctx->cached_cq_overflow));
@@ -1273,17 +1357,14 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
}
io_commit_cqring(ctx);
- if (cqe) {
- clear_bit(0, &ctx->sq_check_overflow);
- clear_bit(0, &ctx->cq_check_overflow);
- ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
- }
+ io_cqring_mark_overflow(ctx);
+
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
while (!list_empty(&list)) {
- req = list_first_entry(&list, struct io_kiocb, list);
- list_del(&req->list);
+ req = list_first_entry(&list, struct io_kiocb, compl.list);
+ list_del(&req->compl.list);
io_put_req(req);
}
@@ -1316,11 +1397,12 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
set_bit(0, &ctx->cq_check_overflow);
ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
}
+ io_clean_op(req);
req->flags |= REQ_F_OVERFLOW;
- refcount_inc(&req->refs);
req->result = res;
- req->cflags = cflags;
- list_add_tail(&req->list, &ctx->cq_overflow_list);
+ req->compl.cflags = cflags;
+ refcount_inc(&req->refs);
+ list_add_tail(&req->compl.list, &ctx->cq_overflow_list);
}
}
@@ -1329,7 +1411,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
__io_cqring_fill_event(req, res, 0);
}
-static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
+static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
{
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
@@ -1342,9 +1424,52 @@ static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
io_cqring_ev_posted(ctx);
}
-static void io_cqring_add_event(struct io_kiocb *req, long res)
+static void io_submit_flush_completions(struct io_comp_state *cs)
+{
+ struct io_ring_ctx *ctx = cs->ctx;
+
+ spin_lock_irq(&ctx->completion_lock);
+ while (!list_empty(&cs->list)) {
+ struct io_kiocb *req;
+
+ req = list_first_entry(&cs->list, struct io_kiocb, compl.list);
+ list_del(&req->compl.list);
+ __io_cqring_fill_event(req, req->result, req->compl.cflags);
+ if (!(req->flags & REQ_F_LINK_HEAD)) {
+ req->flags |= REQ_F_COMP_LOCKED;
+ io_put_req(req);
+ } else {
+ spin_unlock_irq(&ctx->completion_lock);
+ io_put_req(req);
+ spin_lock_irq(&ctx->completion_lock);
+ }
+ }
+ io_commit_cqring(ctx);
+ spin_unlock_irq(&ctx->completion_lock);
+
+ io_cqring_ev_posted(ctx);
+ cs->nr = 0;
+}
+
+static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags,
+ struct io_comp_state *cs)
{
- __io_cqring_add_event(req, res, 0);
+ if (!cs) {
+ io_cqring_add_event(req, res, cflags);
+ io_put_req(req);
+ } else {
+ io_clean_op(req);
+ req->result = res;
+ req->compl.cflags = cflags;
+ list_add_tail(&req->compl.list, &cs->list);
+ if (++cs->nr >= 32)
+ io_submit_flush_completions(cs);
+ }
+}
+
+static void io_req_complete(struct io_kiocb *req, long res)
+{
+ __io_req_complete(req, res, 0, NULL);
}
static inline bool io_is_fallback_req(struct io_kiocb *req)
@@ -1370,11 +1495,7 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct io_kiocb *req;
- if (!state) {
- req = kmem_cache_alloc(req_cachep, gfp);
- if (unlikely(!req))
- goto fallback;
- } else if (!state->free_reqs) {
+ if (!state->free_reqs) {
size_t sz;
int ret;
@@ -1412,21 +1533,15 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file,
fput(file);
}
-static void __io_req_aux_free(struct io_kiocb *req)
+static void io_dismantle_req(struct io_kiocb *req)
{
- if (req->flags & REQ_F_NEED_CLEANUP)
- io_cleanup_req(req);
+ io_clean_op(req);
- kfree(req->io);
+ if (req->io)
+ kfree(req->io);
if (req->file)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
- __io_put_req_task(req);
- io_req_work_drop_env(req);
-}
-
-static void __io_free_req(struct io_kiocb *req)
-{
- __io_req_aux_free(req);
+ io_req_clean_work(req);
if (req->flags & REQ_F_INFLIGHT) {
struct io_ring_ctx *ctx = req->ctx;
@@ -1438,57 +1553,20 @@ static void __io_free_req(struct io_kiocb *req)
wake_up(&ctx->inflight_wait);
spin_unlock_irqrestore(&ctx->inflight_lock, flags);
}
-
- percpu_ref_put(&req->ctx->refs);
- if (likely(!io_is_fallback_req(req)))
- kmem_cache_free(req_cachep, req);
- else
- clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req);
}
-struct req_batch {
- void *reqs[IO_IOPOLL_BATCH];
- int to_free;
- int need_iter;
-};
-
-static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
+static void __io_free_req(struct io_kiocb *req)
{
- if (!rb->to_free)
- return;
- if (rb->need_iter) {
- int i, inflight = 0;
- unsigned long flags;
-
- for (i = 0; i < rb->to_free; i++) {
- struct io_kiocb *req = rb->reqs[i];
-
- if (req->flags & REQ_F_INFLIGHT)
- inflight++;
- __io_req_aux_free(req);
- }
- if (!inflight)
- goto do_free;
-
- spin_lock_irqsave(&ctx->inflight_lock, flags);
- for (i = 0; i < rb->to_free; i++) {
- struct io_kiocb *req = rb->reqs[i];
-
- if (req->flags & REQ_F_INFLIGHT) {
- list_del(&req->inflight_entry);
- if (!--inflight)
- break;
- }
- }
- spin_unlock_irqrestore(&ctx->inflight_lock, flags);
+ struct io_ring_ctx *ctx;
- if (waitqueue_active(&ctx->inflight_wait))
- wake_up(&ctx->inflight_wait);
- }
-do_free:
- kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
- percpu_ref_put_many(&ctx->refs, rb->to_free);
- rb->to_free = rb->need_iter = 0;
+ io_dismantle_req(req);
+ __io_put_req_task(req);
+ ctx = req->ctx;
+ if (likely(!io_is_fallback_req(req)))
+ kmem_cache_free(req_cachep, req);
+ else
+ clear_bit_unlock(0, (unsigned long *) &ctx->fallback_req);
+ percpu_ref_put(&ctx->refs);
}
static bool io_link_cancel_timeout(struct io_kiocb *req)
@@ -1508,53 +1586,67 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
return false;
}
-static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
+static bool __io_kill_linked_timeout(struct io_kiocb *req)
+{
+ struct io_kiocb *link;
+ bool wake_ev;
+
+ if (list_empty(&req->link_list))
+ return false;
+ link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
+ if (link->opcode != IORING_OP_LINK_TIMEOUT)
+ return false;
+
+ list_del_init(&link->link_list);
+ wake_ev = io_link_cancel_timeout(link);
+ req->flags &= ~REQ_F_LINK_TIMEOUT;
+ return wake_ev;
+}
+
+static void io_kill_linked_timeout(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- bool wake_ev = false;
+ bool wake_ev;
- /* Already got next link */
- if (req->flags & REQ_F_LINK_NEXT)
- return;
+ if (!(req->flags & REQ_F_COMP_LOCKED)) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ wake_ev = __io_kill_linked_timeout(req);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ } else {
+ wake_ev = __io_kill_linked_timeout(req);
+ }
+
+ if (wake_ev)
+ io_cqring_ev_posted(ctx);
+}
+
+static struct io_kiocb *io_req_link_next(struct io_kiocb *req)
+{
+ struct io_kiocb *nxt;
/*
* The list should never be empty when we are called here. But could
* potentially happen if the chain is messed up, check to be on the
* safe side.
*/
- while (!list_empty(&req->link_list)) {
- struct io_kiocb *nxt = list_first_entry(&