summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJens Axboe <axboe@kernel.dk>2022-05-09 06:34:52 -0600
committerJens Axboe <axboe@kernel.dk>2022-05-09 06:35:11 -0600
commit1308689906ad35b017eec8e595a2beb6f2f972fb (patch)
treeaf2095feb8ebad82d224e0dd8dcc1b2162f57cf9
parentc5eb0a61238dd6faf37f58c9ce61c9980aaffd7a (diff)
parent7ccba24d3bc084d891def1a6fea504e4cb327a8c (diff)
downloadlinux-1308689906ad35b017eec8e595a2beb6f2f972fb.tar.gz
linux-1308689906ad35b017eec8e595a2beb6f2f972fb.tar.bz2
linux-1308689906ad35b017eec8e595a2beb6f2f972fb.zip
Merge branch 'for-5.19/io_uring' into for-5.19/io_uring-passthrough
* for-5.19/io_uring: (85 commits) io_uring: don't clear req->kbuf when buffer selection is done io_uring: eliminate the need to track provided buffer ID separately io_uring: move provided buffer state closer to submit state io_uring: move provided and fixed buffers into the same io_kiocb area io_uring: abstract out provided buffer list selection io_uring: never call io_buffer_select() for a buffer re-select io_uring: get rid of hashed provided buffer groups io_uring: always use req->buf_index for the provided buffer group io_uring: ignore ->buf_index if REQ_F_BUFFER_SELECT isn't set io_uring: kill io_rw_buffer_select() wrapper io_uring: make io_buffer_select() return the user address directly io_uring: kill io_recv_buffer_select() wrapper io_uring: use 'sr' vs 'req->sr_msg' consistently io_uring: add POLL_FIRST support for send/sendmsg and recv/recvmsg io_uring: check IOPOLL/ioprio support upfront io_uring: replace smp_mb() with smp_mb__after_atomic() in io_sq_thread() io_uring: add IORING_SETUP_TASKRUN_FLAG io_uring: use TWA_SIGNAL_NO_IPI if IORING_SETUP_COOP_TASKRUN is used io_uring: set task_work notify method at init time io-wq: use __set_notify_signal() to wake workers ...
-rw-r--r--fs/io-wq.c4
-rw-r--r--fs/io-wq.h1
-rw-r--r--fs/io_uring.c2078
-rw-r--r--include/linux/sched/signal.h13
-rw-r--r--include/linux/task_work.h1
-rw-r--r--include/trace/events/io_uring.h42
-rw-r--r--include/uapi/linux/io_uring.h37
-rw-r--r--kernel/task_work.c25
8 files changed, 1254 insertions, 947 deletions
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 32aeb2c581c5..824623bcf1a5 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -871,7 +871,7 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe,
static bool io_wq_worker_wake(struct io_worker *worker, void *data)
{
- set_notify_signal(worker->task);
+ __set_notify_signal(worker->task);
wake_up_process(worker->task);
return false;
}
@@ -991,7 +991,7 @@ static bool __io_wq_worker_cancel(struct io_worker *worker,
{
if (work && match->fn(work, match->data)) {
work->flags |= IO_WQ_WORK_CANCEL;
- set_notify_signal(worker->task);
+ __set_notify_signal(worker->task);
return true;
}
diff --git a/fs/io-wq.h b/fs/io-wq.h
index dbecd27656c7..ba6eee76d028 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -155,6 +155,7 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack)
struct io_wq_work {
struct io_wq_work_node list;
unsigned flags;
+ int cancel_seq;
};
static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 91de361ea9ab..9f340f44827b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -113,6 +113,9 @@
#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
REQ_F_POLLED | REQ_F_CREDS | REQ_F_ASYNC_DATA)
+#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
+ IO_REQ_CLEAN_FLAGS)
+
#define IO_TCTX_REFS_CACHE_NR (1U << 10)
struct io_uring {
@@ -166,7 +169,7 @@ struct io_rings {
* The application needs a full memory barrier before checking
* for IORING_SQ_NEED_WAKEUP after updating the sq tail.
*/
- u32 sq_flags;
+ atomic_t sq_flags;
/*
* Runtime CQ flags
*
@@ -220,6 +223,23 @@ struct io_overflow_cqe {
struct list_head list;
};
+/*
+ * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0
+ * and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we
+ * can't safely always dereference the file when the task has exited and ring
+ * cleanup is done. If a file is tracked and part of SCM, then unix gc on
+ * process exit may reap it before __io_sqe_files_unregister() is run.
+ */
+#define FFS_NOWAIT 0x1UL
+#define FFS_ISREG 0x2UL
+#if defined(CONFIG_64BIT)
+#define FFS_SCM 0x4UL
+#else
+#define IO_URING_SCM_ALL
+#define FFS_SCM 0x0UL
+#endif
+#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM)
+
struct io_fixed_file {
/* file * with additional FFS_* flags */
unsigned long file_ptr;
@@ -262,7 +282,6 @@ struct io_rsrc_data {
};
struct io_buffer_list {
- struct list_head list;
struct list_head buf_list;
__u16 bgid;
};
@@ -337,7 +356,7 @@ struct io_ev_fd {
struct rcu_head rcu;
};
-#define IO_BUFFERS_HASH_BITS 5
+#define BGID_ARRAY 64
struct io_ring_ctx {
/* const or read-mostly hot data */
@@ -346,6 +365,7 @@ struct io_ring_ctx {
struct io_rings *rings;
unsigned int flags;
+ enum task_work_notify_mode notify_method;
unsigned int compat: 1;
unsigned int drain_next: 1;
unsigned int restricted: 1;
@@ -353,6 +373,7 @@ struct io_ring_ctx {
unsigned int drain_active: 1;
unsigned int drain_disabled: 1;
unsigned int has_evfd: 1;
+ unsigned int syscall_iopoll: 1;
} ____cacheline_aligned_in_smp;
/* submission data */
@@ -382,17 +403,21 @@ struct io_ring_ctx {
*/
struct io_rsrc_node *rsrc_node;
int rsrc_cached_refs;
+ atomic_t cancel_seq;
struct io_file_table file_table;
unsigned nr_user_files;
unsigned nr_user_bufs;
struct io_mapped_ubuf **user_bufs;
struct io_submit_state submit_state;
+
+ struct io_buffer_list *io_bl;
+ struct xarray io_bl_xa;
+ struct list_head io_buffers_cache;
+
struct list_head timeout_list;
struct list_head ltimeout_list;
struct list_head cq_overflow_list;
- struct list_head *io_buffers;
- struct list_head io_buffers_cache;
struct list_head apoll_cache;
struct xarray personalities;
u32 pers_next;
@@ -409,9 +434,16 @@ struct io_ring_ctx {
struct wait_queue_head sqo_sq_wait;
struct list_head sqd_list;
- unsigned long check_cq_overflow;
+ unsigned long check_cq;
struct {
+ /*
+ * We cache a range of free CQEs we can use, once exhausted it
+ * should go through a slower range setup, see __io_get_cqe()
+ */
+ struct io_uring_cqe *cqe_cached;
+ struct io_uring_cqe *cqe_sentinel;
+
unsigned cached_cq_tail;
unsigned cq_entries;
struct io_ev_fd __rcu *io_ev_fd;
@@ -557,6 +589,8 @@ struct io_sync {
struct io_cancel {
struct file *file;
u64 addr;
+ u32 flags;
+ s32 fd;
};
struct io_timeout {
@@ -602,9 +636,9 @@ struct io_sr_msg {
void __user *buf;
};
int msg_flags;
- int bgid;
size_t len;
size_t done_io;
+ unsigned int flags;
};
struct io_open {
@@ -862,6 +896,21 @@ enum {
IORING_RSRC_BUFFER = 1,
};
+struct io_cqe {
+ __u64 user_data;
+ __s32 res;
+ /* fd initially, then cflags for completion */
+ union {
+ __u32 flags;
+ int fd;
+ };
+};
+
+enum {
+ IO_CHECK_CQ_OVERFLOW_BIT,
+ IO_CHECK_CQ_DROPPED_BIT,
+};
+
/*
* NOTE! Each of the iocb union members has the file pointer
* as the first entry in their struct definition. So you can
@@ -902,23 +951,28 @@ struct io_kiocb {
u8 opcode;
/* polled IO has completed */
u8 iopoll_completed;
+ /*
+ * Can be either a fixed buffer index, or used with provided buffers.
+ * For the latter, before issue it points to the buffer group ID,
+ * and after selection it points to the buffer ID itself.
+ */
u16 buf_index;
unsigned int flags;
- u64 user_data;
- u32 result;
- /* fd initially, then cflags for completion */
- union {
- u32 cflags;
- int fd;
- };
+ struct io_cqe cqe;
struct io_ring_ctx *ctx;
struct task_struct *task;
- struct percpu_ref *fixed_rsrc_refs;
- /* store used ubuf, so we can prevent reloading */
- struct io_mapped_ubuf *imu;
+ struct io_rsrc_node *rsrc_node;
+
+ union {
+ /* store used ubuf, so we can prevent reloading */
+ struct io_mapped_ubuf *imu;
+
+ /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
+ struct io_buffer *kbuf;
+ };
union {
/* used by request caches, completion batching and iopoll */
@@ -935,8 +989,6 @@ struct io_kiocb {
struct async_poll *apoll;
/* opcode allocated if it needs to store data for async defer */
void *async_data;
- /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
- struct io_buffer *kbuf;
/* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
struct io_kiocb *link;
/* custom credentials, valid IFF REQ_F_CREDS is set */
@@ -956,6 +1008,16 @@ struct io_defer_entry {
u32 seq;
};
+struct io_cancel_data {
+ struct io_ring_ctx *ctx;
+ union {
+ u64 data;
+ struct file *file;
+ };
+ u32 flags;
+ int seq;
+};
+
struct io_op_def {
/* needs req->file assigned */
unsigned needs_file : 1;
@@ -977,12 +1039,19 @@ struct io_op_def {
unsigned not_supported : 1;
/* skip auditing */
unsigned audit_skip : 1;
+ /* supports ioprio */
+ unsigned ioprio : 1;
+ /* supports iopoll */
+ unsigned iopoll : 1;
/* size of async data needed, if any */
unsigned short async_size;
};
static const struct io_op_def io_op_defs[] = {
- [IORING_OP_NOP] = {},
+ [IORING_OP_NOP] = {
+ .audit_skip = 1,
+ .iopoll = 1,
+ },
[IORING_OP_READV] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
@@ -991,6 +1060,8 @@ static const struct io_op_def io_op_defs[] = {
.needs_async_setup = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITEV] = {
@@ -1001,6 +1072,8 @@ static const struct io_op_def io_op_defs[] = {
.needs_async_setup = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_FSYNC] = {
@@ -1013,6 +1086,8 @@ static const struct io_op_def io_op_defs[] = {
.pollin = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITE_FIXED] = {
@@ -1022,6 +1097,8 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_POLL_ADD] = {
@@ -1086,6 +1163,7 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_CLOSE] = {},
[IORING_OP_FILES_UPDATE] = {
.audit_skip = 1,
+ .iopoll = 1,
},
[IORING_OP_STATX] = {
.audit_skip = 1,
@@ -1097,6 +1175,8 @@ static const struct io_op_def io_op_defs[] = {
.buffer_select = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITE] = {
@@ -1106,6 +1186,8 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_FADVISE] = {
@@ -1140,9 +1222,11 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_PROVIDE_BUFFERS] = {
.audit_skip = 1,
+ .iopoll = 1,
},
[IORING_OP_REMOVE_BUFFERS] = {
.audit_skip = 1,
+ .iopoll = 1,
},
[IORING_OP_TEE] = {
.needs_file = 1,
@@ -1160,11 +1244,13 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_LINKAT] = {},
[IORING_OP_MSG_RING] = {
.needs_file = 1,
+ .iopoll = 1,
},
};
/* requests with any of those set should undergo io_disarm_next() */
#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
+#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)
static bool io_disarm_next(struct io_kiocb *req);
static void io_uring_del_tctx_node(unsigned long index);
@@ -1173,10 +1259,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
bool cancel_all);
static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
-static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags);
-
-static void io_put_req(struct io_kiocb *req);
-static void io_put_req_deferred(struct io_kiocb *req);
+static void __io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags);
static void io_dismantle_req(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
@@ -1188,7 +1271,7 @@ static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
static inline struct file *io_file_get_normal(struct io_kiocb *req, int fd);
static void io_drop_inflight_file(struct io_kiocb *req);
static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags);
-static void __io_queue_sqe(struct io_kiocb *req);
+static void io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);
static void io_req_task_queue(struct io_kiocb *req);
@@ -1201,6 +1284,7 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
static void io_eventfd_signal(struct io_ring_ctx *ctx);
+static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags);
static struct kmem_cache *req_cachep;
@@ -1219,6 +1303,42 @@ struct sock *io_uring_get_socket(struct file *file)
}
EXPORT_SYMBOL(io_uring_get_socket);
+#if defined(CONFIG_UNIX)
+static inline bool io_file_need_scm(struct file *filp)
+{
+#if defined(IO_URING_SCM_ALL)
+ return true;
+#else
+ return !!unix_get_socket(filp);
+#endif
+}
+#else
+static inline bool io_file_need_scm(struct file *filp)
+{
+ return false;
+}
+#endif
+
+static void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags)
+{
+ lockdep_assert_held(&ctx->uring_lock);
+ if (issue_flags & IO_URING_F_UNLOCKED)
+ mutex_unlock(&ctx->uring_lock);
+}
+
+static void io_ring_submit_lock(struct io_ring_ctx *ctx, unsigned issue_flags)
+{
+ /*
+ * "Normal" inline submissions always hold the uring_lock, since we
+ * grab it from the system call. Same is true for the SQPOLL offload.
+ * The only exception is when we've detached the request and issue it
+ * from an async worker thread, grab the lock for that case.
+ */
+ if (issue_flags & IO_URING_F_UNLOCKED)
+ mutex_lock(&ctx->uring_lock);
+ lockdep_assert_held(&ctx->uring_lock);
+}
+
static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
{
if (!*locked) {
@@ -1280,31 +1400,36 @@ static inline void io_req_set_refcount(struct io_kiocb *req)
#define IO_RSRC_REF_BATCH 100
+static void io_rsrc_put_node(struct io_rsrc_node *node, int nr)
+{
+ percpu_ref_put_many(&node->refs, nr);
+}
+
static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
- struct percpu_ref *ref = req->fixed_rsrc_refs;
+ struct io_rsrc_node *node = req->rsrc_node;
- if (ref) {
- if (ref == &ctx->rsrc_node->refs)
+ if (node) {
+ if (node == ctx->rsrc_node)
ctx->rsrc_cached_refs++;
else
- percpu_ref_put(ref);
+ io_rsrc_put_node(node, 1);
}
}
-static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx)
+static inline void io_req_put_rsrc(struct io_kiocb *req)
{
- if (req->fixed_rsrc_refs)
- percpu_ref_put(req->fixed_rsrc_refs);
+ if (req->rsrc_node)
+ io_rsrc_put_node(req->rsrc_node, 1);
}
static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
if (ctx->rsrc_cached_refs) {
- percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs);
+ io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs);
ctx->rsrc_cached_refs = 0;
}
}
@@ -1320,8 +1445,8 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req,
struct io_ring_ctx *ctx,
unsigned int issue_flags)
{
- if (!req->fixed_rsrc_refs) {
- req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
+ if (!req->rsrc_node) {
+ req->rsrc_node = ctx->rsrc_node;
if (!(issue_flags & IO_URING_F_UNLOCKED)) {
lockdep_assert_held(&ctx->uring_lock);
@@ -1329,21 +1454,17 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req,
if (unlikely(ctx->rsrc_cached_refs < 0))
io_rsrc_refs_refill(ctx);
} else {
- percpu_ref_get(req->fixed_rsrc_refs);
+ percpu_ref_get(&req->rsrc_node->refs);
}
}
}
static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list)
{
- struct io_buffer *kbuf = req->kbuf;
- unsigned int cflags;
-
- cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT);
req->flags &= ~REQ_F_BUFFER_SELECTED;
- list_add(&kbuf->list, list);
- req->kbuf = NULL;
- return cflags;
+ list_add(&req->kbuf->list, list);
+
+ return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
}
static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
@@ -1393,15 +1514,10 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req,
static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
unsigned int bgid)
{
- struct list_head *hash_list;
- struct io_buffer_list *bl;
-
- hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
- list_for_each_entry(bl, hash_list, list)
- if (bl->bgid == bgid || bgid == -1U)
- return bl;
+ if (ctx->io_bl && bgid < BGID_ARRAY)
+ return &ctx->io_bl[bgid];
- return NULL;
+ return xa_load(&ctx->io_bl_xa, bgid);
}
static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
@@ -1416,19 +1532,15 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
if (req->flags & REQ_F_PARTIAL_IO)
return;
- if (issue_flags & IO_URING_F_UNLOCKED)
- mutex_lock(&ctx->uring_lock);
-
- lockdep_assert_held(&ctx->uring_lock);
+ io_ring_submit_lock(ctx, issue_flags);
buf = req->kbuf;
bl = io_buffer_get_list(ctx, buf->bgid);
list_add(&buf->list, &bl->buf_list);
req->flags &= ~REQ_F_BUFFER_SELECTED;
- req->kbuf = NULL;
+ req->buf_index = buf->bgid;
- if (issue_flags & IO_URING_F_UNLOCKED)
- mutex_unlock(&ctx->uring_lock);
+ io_ring_submit_unlock(ctx, issue_flags);
}
static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
@@ -1469,7 +1581,12 @@ static inline void req_set_fail(struct io_kiocb *req)
static inline void req_fail_link_node(struct io_kiocb *req, int res)
{
req_set_fail(req);
- req->result = res;
+ req->cqe.res = res;
+}
+
+static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
+{
+ wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
}
static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
@@ -1506,12 +1623,14 @@ static __cold void io_fallback_req_func(struct work_struct *work)
static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
- int i, hash_bits;
+ int hash_bits;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return NULL;
+ xa_init(&ctx->io_bl_xa);
+
/*
* Use 5 bits less than the max cq entries, that should give us around
* 32 entries per hash list if totally full and uniformly spread.
@@ -1533,13 +1652,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
/* set invalid range, so io_import_fixed() fails meeting it */
ctx->dummy_ubuf->ubuf = -1UL;
- ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS,
- sizeof(struct list_head), GFP_KERNEL);
- if (!ctx->io_buffers)
- goto err;
- for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++)
- INIT_LIST_HEAD(&ctx->io_buffers[i]);
-
if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
goto err;
@@ -1575,7 +1687,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
err:
kfree(ctx->dummy_ubuf);
kfree(ctx->cancel_hash);
- kfree(ctx->io_buffers);
+ kfree(ctx->io_bl);
+ xa_destroy(&ctx->io_bl_xa);
kfree(ctx);
return NULL;
}
@@ -1599,10 +1712,6 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
return false;
}
-#define FFS_NOWAIT 0x1UL
-#define FFS_ISREG 0x2UL
-#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG)
-
static inline bool io_req_ffs_set(struct io_kiocb *req)
{
return req->flags & REQ_F_FIXED_FILE;
@@ -1629,6 +1738,17 @@ static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
return __io_prep_linked_timeout(req);
}
+static noinline void __io_arm_ltimeout(struct io_kiocb *req)
+{
+ io_queue_linked_timeout(__io_prep_linked_timeout(req));
+}
+
+static inline void io_arm_ltimeout(struct io_kiocb *req)
+{
+ if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
+ __io_arm_ltimeout(req);
+}
+
static void io_prep_async_work(struct io_kiocb *req)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
@@ -1641,6 +1761,7 @@ static void io_prep_async_work(struct io_kiocb *req)
req->work.list.next = NULL;
req->work.flags = 0;
+ req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
if (req->flags & REQ_F_FORCE_ASYNC)
req->work.flags |= IO_WQ_WORK_CONCURRENT;
@@ -1672,17 +1793,15 @@ static void io_prep_async_link(struct io_kiocb *req)
static inline void io_req_add_compl_list(struct io_kiocb *req)
{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_submit_state *state = &ctx->submit_state;
+ struct io_submit_state *state = &req->ctx->submit_state;
if (!(req->flags & REQ_F_CQE_SKIP))
- ctx->submit_state.flush_cqes = true;
+ state->flush_cqes = true;
wq_list_add_tail(&req->comp_list, &state->compl_reqs);
}
-static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
+static void io_queue_iowq(struct io_kiocb *req, bool *dont_use)
{
- struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link = io_prep_linked_timeout(req);
struct io_uring_task *tctx = req->task->io_uring;
@@ -1702,8 +1821,9 @@ static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
req->work.flags |= IO_WQ_WORK_CANCEL;
- trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags,
- &req->work, io_wq_is_hashed(&req->work));
+ trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data,
+ req->opcode, req->flags, &req->work,
+ io_wq_is_hashed(&req->work));
io_wq_enqueue(tctx->io_wq, &req->work);
if (link)
io_queue_linked_timeout(link);
@@ -1721,8 +1841,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status)
atomic_set(&req->ctx->cq_timeouts,
atomic_read(&req->ctx->cq_timeouts) + 1);
list_del_init(&req->timeout.list);
- io_fill_cqe_req(req, status, 0);
- io_put_req_deferred(req);
+ io_req_tw_post_queue(req, status, 0);
}
}
@@ -1804,21 +1923,38 @@ static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
}
-static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
+/*
+ * writes to the cq entry need to come after reading head; the
+ * control dependency is enough as we're using WRITE_ONCE to
+ * fill the cq entry
+ */
+static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
{
struct io_rings *rings = ctx->rings;
- unsigned tail, mask = ctx->cq_entries - 1;
-
- /*
- * writes to the cq entry need to come after reading head; the
- * control dependency is enough as we're using WRITE_ONCE to
- * fill the cq entry
- */
- if (__io_cqring_events(ctx) == ctx->cq_entries)
+ unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
+ unsigned int free, queued, len;
+
+ /* userspace may cheat modifying the tail, be safe and do min */
+ queued = min(__io_cqring_events(ctx), ctx->cq_entries);
+ free = ctx->cq_entries - queued;
+ /* we need a contiguous range, limit based on the current array offset */
+ len = min(free, ctx->cq_entries - off);
+ if (!len)
return NULL;
- tail = ctx->cached_cq_tail++;
- return &rings->cqes[tail & mask];
+ ctx->cached_cq_tail++;
+ ctx->cqe_cached = &rings->cqes[off];
+ ctx->cqe_sentinel = ctx->cqe_cached + len;
+ return ctx->cqe_cached++;
+}
+
+static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
+{
+ if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) {
+ ctx->cached_cq_tail++;
+ return ctx->cqe_cached++;
+ }
+ return __io_get_cqe(ctx);
}
static void io_eventfd_signal(struct io_ring_ctx *ctx)
@@ -1915,13 +2051,11 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
all_flushed = list_empty(&ctx->cq_overflow_list);
if (all_flushed) {
- clear_bit(0, &ctx->check_cq_overflow);
- WRITE_ONCE(ctx->rings->sq_flags,
- ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
+ clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
+ atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
}
- if (posted)
- io_commit_cqring(ctx);
+ io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
if (posted)
io_cqring_ev_posted(ctx);
@@ -1932,7 +2066,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
{
bool ret = true;
- if (test_bit(0, &ctx->check_cq_overflow)) {
+ if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
/* iopoll syncs against uring_lock, not completion_lock */
if (ctx->flags & IORING_SETUP_IOPOLL)
mutex_lock(&ctx->uring_lock);
@@ -1944,19 +2078,23 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
return ret;
}
-/* must to be called somewhat shortly after putting a request */
-static inline void io_put_task(struct task_struct *task, int nr)
+static void __io_put_task(struct task_struct *task, int nr)
{
struct io_uring_task *tctx = task->io_uring;
- if (likely(task == current)) {
- tctx->cached_refs += nr;
- } else {
- percpu_counter_sub(&tctx->inflight, nr);
- if (unlikely(atomic_read(&tctx->in_idle)))
- wake_up(&tctx->wait);
- put_task_struct_many(task, nr);
- }
+ percpu_counter_sub(&tctx->inflight, nr);
+ if (unlikely(atomic_read(&tctx->in_idle)))
+ wake_up(&tctx->wait);
+ put_task_struct_many(task, nr);
+}
+
+/* must to be called somewhat shortly after putting a request */
+static inline void io_put_task(struct task_struct *task, int nr)
+{
+ if (likely(task == current))
+ task->io_uring->cached_refs += nr;
+ else
+ __io_put_task(task, nr);
}
static void io_task_refs_refill(struct io_uring_task *tctx)
@@ -1995,6 +2133,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
struct io_overflow_cqe *ocqe;
ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
+ trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
if (!ocqe) {
/*
* If we're in ring overflow flush mode, or in task cancel mode,
@@ -2002,12 +2141,12 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
* on the floor.
*/
io_account_cq_overflow(ctx);
+ set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
return false;
}
if (list_empty(&ctx->cq_overflow_list)) {
- set_bit(0, &ctx->check_cq_overflow);
- WRITE_ONCE(ctx->rings->sq_flags,
- ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
+ set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
+ atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
}
ocqe->cqe.user_data = user_data;
@@ -2037,16 +2176,32 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
return io_cqring_event_overflow(ctx, user_data, res, cflags);
}
-static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
+static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
{
- trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags);
- return __io_fill_cqe(req->ctx, req->user_data, res, cflags);
+ struct io_uring_cqe *cqe;
+
+ trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
+ req->cqe.res, req->cqe.flags);
+
+ /*
+ * If we can't get a cq entry, userspace overflowed the
+ * submission (by quite a lot). Increment the overflow count in
+ * the ring.
+ */
+ cqe = io_get_cqe(ctx);
+ if (likely(cqe)) {
+ memcpy(cqe, &req->cqe, sizeof(*cqe));
+ return true;
+ }
+ return io_cqring_event_overflow(ctx, req->cqe.user_data,
+ req->cqe.res, req->cqe.flags);
}
-static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
+static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
{
- if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe_req(req, res, cflags);
+ trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags);
+ return __io_fill_cqe(req->ctx, req->cqe.user_data, res, cflags);
}
static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
@@ -2069,7 +2224,7 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res,
* free_list cache.
*/
if (req_ref_put_and_test(req)) {
- if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
+ if (req->flags & IO_REQ_LINK_FLAGS) {
if (req->flags & IO_DISARM_MASK)
io_disarm_next(req);
if (req->link) {
@@ -2077,7 +2232,7 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res,
req->link = NULL;
}
}
- io_req_put_rsrc(req, ctx);
+ io_req_put_rsrc(req);
/*
* Selected buffer deallocation in io_clean_op() assumes that
* we don't hold ->completion_lock. Clean them here to avoid
@@ -2106,8 +2261,8 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res,
static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
u32 cflags)
{
- req->result = res;
- req->cflags = cflags;
+ req->cqe.res = res;
+ req->cqe.flags = cflags;
req->flags |= REQ_F_COMPLETE_INLINE;
}
@@ -2131,17 +2286,6 @@ static void io_req_complete_failed(struct io_kiocb *req, s32 res)
io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
}
-static void io_req_complete_fail_submit(struct io_kiocb *req)
-{
- /*
- * We don't submit, fail them all, for that replace hardlinks with
- * normal links. Extra REQ_F_LINK is tolerated.
- */
- req->flags &= ~REQ_F_HARDLINK;
- req->flags |= REQ_F_LINK;
- io_req_complete_failed(req, req->result);
-}
-
/*
* Don't initialise the fields below on every allocation, but do that in
* advance and keep them valid across allocations.
@@ -2152,7 +2296,7 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
req->link = NULL;
req->async_data = NULL;
/* not necessary, but safer to zero */
- req->result = 0;
+ req->cqe.res = 0;
}
static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
@@ -2164,19 +2308,9 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
spin_unlock(&ctx->completion_lock);
}
-/* Returns true IFF there are requests in the cache */
-static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
+static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
{
- struct io_submit_state *state = &ctx->submit_state;
-
- /*
- * If we have more than a batch's worth of requests in our IRQ side
- * locked cache, grab the lock and move them over to our submission
- * side cache.
- */
- if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
- io_flush_cached_locked_reqs(ctx, state);
- return !!state->free_list.next;
+ return !ctx->submit_state.free_list.next;
}
/*
@@ -2188,14 +2322,20 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
- struct io_submit_state *state = &ctx->submit_state;
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
void *reqs[IO_REQ_ALLOC_BATCH];
- struct io_kiocb *req;
int ret, i;
- if (likely(state->free_list.next || io_flush_cached_reqs(ctx)))
- return true;
+ /*
+ * If we have more than a batch's worth of requests in our IRQ side
+ * locked cache, grab the lock and move them over to our submission
+ * side cache.
+ */
+ if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) {
+ io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
+ i