author     Jens Axboe <axboe@kernel.dk>   2022-05-09 06:34:52 -0600
committer  Jens Axboe <axboe@kernel.dk>   2022-05-09 06:35:11 -0600
commit     1308689906ad35b017eec8e595a2beb6f2f972fb (patch)
tree       af2095feb8ebad82d224e0dd8dcc1b2162f57cf9
parent     c5eb0a61238dd6faf37f58c9ce61c9980aaffd7a (diff)
parent     7ccba24d3bc084d891def1a6fea504e4cb327a8c (diff)
download   linux-1308689906ad35b017eec8e595a2beb6f2f972fb.tar.gz
           linux-1308689906ad35b017eec8e595a2beb6f2f972fb.tar.bz2
           linux-1308689906ad35b017eec8e595a2beb6f2f972fb.zip
Merge branch 'for-5.19/io_uring' into for-5.19/io_uring-passthrough
* for-5.19/io_uring: (85 commits)
io_uring: don't clear req->kbuf when buffer selection is done
io_uring: eliminate the need to track provided buffer ID separately
io_uring: move provided buffer state closer to submit state
io_uring: move provided and fixed buffers into the same io_kiocb area
io_uring: abstract out provided buffer list selection
io_uring: never call io_buffer_select() for a buffer re-select
io_uring: get rid of hashed provided buffer groups
io_uring: always use req->buf_index for the provided buffer group
io_uring: ignore ->buf_index if REQ_F_BUFFER_SELECT isn't set
io_uring: kill io_rw_buffer_select() wrapper
io_uring: make io_buffer_select() return the user address directly
io_uring: kill io_recv_buffer_select() wrapper
io_uring: use 'sr' vs 'req->sr_msg' consistently
io_uring: add POLL_FIRST support for send/sendmsg and recv/recvmsg
io_uring: check IOPOLL/ioprio support upfront
io_uring: replace smp_mb() with smp_mb__after_atomic() in io_sq_thread()
io_uring: add IORING_SETUP_TASKRUN_FLAG
io_uring: use TWA_SIGNAL_NO_IPI if IORING_SETUP_COOP_TASKRUN is used
io_uring: set task_work notify method at init time
io-wq: use __set_notify_signal() to wake workers
...
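Two of the changes called out above add new ring-setup uAPI: IORING_SETUP_COOP_TASKRUN (wired up via TWA_SIGNAL_NO_IPI) and IORING_SETUP_TASKRUN_FLAG. The point of the pair is to skip the inter-processor interrupt normally used to force task_work to run, letting completions be reaped on the task's next kernel transition instead. As a rough sketch of how an application might opt in, here is a minimal raw-syscall example; error handling and the ring mmap are elided, and it assumes 5.19-era uapi headers:

#include <linux/io_uring.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	/*
	 * IORING_SETUP_COOP_TASKRUN: don't IPI/signal the submitting task
	 * to run task_work; completions show up on its next transition to
	 * the kernel.
	 * IORING_SETUP_TASKRUN_FLAG: have the kernel also set
	 * IORING_SQ_TASKRUN in the SQ ring flags, so the application can
	 * tell when it needs to enter the kernel to process completions.
	 */
	p.flags = IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG;

	int ring_fd = (int) syscall(__NR_io_uring_setup, 64, &p);
	if (ring_fd < 0) {
		perror("io_uring_setup");
		return 1;
	}
	printf("ring fd %d, sq entries %u\n", ring_fd, p.sq_entries);
	close(ring_fd);
	return 0;
}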
-rw-r--r--  fs/io-wq.c                       |    4
-rw-r--r--  fs/io-wq.h                       |    1
-rw-r--r--  fs/io_uring.c                    | 2078
-rw-r--r--  include/linux/sched/signal.h     |   13
-rw-r--r--  include/linux/task_work.h        |    1
-rw-r--r--  include/trace/events/io_uring.h  |   42
-rw-r--r--  include/uapi/linux/io_uring.h    |   37
-rw-r--r--  kernel/task_work.c               |   25
8 files changed, 1254 insertions, 947 deletions
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 32aeb2c581c5..824623bcf1a5 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -871,7 +871,7 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe,
 static bool io_wq_worker_wake(struct io_worker *worker, void *data)
 {
-	set_notify_signal(worker->task);
+	__set_notify_signal(worker->task);
 	wake_up_process(worker->task);
 	return false;
 }
@@ -991,7 +991,7 @@ static bool __io_wq_worker_cancel(struct io_worker *worker,
 {
 	if (work && match->fn(work, match->data)) {
 		work->flags |= IO_WQ_WORK_CANCEL;
-		set_notify_signal(worker->task);
+		__set_notify_signal(worker->task);
 		return true;
 	}
diff --git a/fs/io-wq.h b/fs/io-wq.h
index dbecd27656c7..ba6eee76d028 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -155,6 +155,7 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack)
 struct io_wq_work {
 	struct io_wq_work_node list;
 	unsigned flags;
+	int cancel_seq;
 };
 
 static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 91de361ea9ab..9f340f44827b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -113,6 +113,9 @@
 #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
 				REQ_F_POLLED | REQ_F_CREDS | REQ_F_ASYNC_DATA)
 
+#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
+				IO_REQ_CLEAN_FLAGS)
+
 #define IO_TCTX_REFS_CACHE_NR	(1U << 10)
 
 struct io_uring {
@@ -166,7 +169,7 @@ struct io_rings {
 	 * The application needs a full memory barrier before checking
 	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
 	 */
-	u32			sq_flags;
+	atomic_t		sq_flags;
 	/*
 	 * Runtime CQ flags
 	 *
@@ -220,6 +223,23 @@ struct io_overflow_cqe {
 	struct list_head list;
};
 
+/*
+ * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0
+ * and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we
+ * can't safely always dereference the file when the task has exited and ring
+ * cleanup is done. If a file is tracked and part of SCM, then unix gc on
+ * process exit may reap it before __io_sqe_files_unregister() is run.
+ */
+#define FFS_NOWAIT		0x1UL
+#define FFS_ISREG		0x2UL
+#if defined(CONFIG_64BIT)
+#define FFS_SCM			0x4UL
+#else
+#define IO_URING_SCM_ALL
+#define FFS_SCM			0x0UL
+#endif
+#define FFS_MASK		~(FFS_NOWAIT|FFS_ISREG|FFS_SCM)
+
 struct io_fixed_file {
 	/* file * with additional FFS_* flags */
 	unsigned long file_ptr;
@@ -262,7 +282,6 @@ struct io_rsrc_data {
 };
 
 struct io_buffer_list {
-	struct list_head list;
 	struct list_head buf_list;
 	__u16 bgid;
 };
@@ -337,7 +356,7 @@ struct io_ev_fd {
 	struct rcu_head		rcu;
 };
 
-#define IO_BUFFERS_HASH_BITS	5
+#define BGID_ARRAY	64
 
 struct io_ring_ctx {
 	/* const or read-mostly hot data */
@@ -346,6 +365,7 @@ struct io_ring_ctx {
 		struct io_rings		*rings;
 		unsigned int		flags;
+		enum task_work_notify_mode	notify_method;
 		unsigned int		compat: 1;
 		unsigned int		drain_next: 1;
 		unsigned int		restricted: 1;
@@ -353,6 +373,7 @@ struct io_ring_ctx {
 		unsigned int		drain_active: 1;
 		unsigned int		drain_disabled: 1;
 		unsigned int		has_evfd: 1;
+		unsigned int		syscall_iopoll: 1;
 	} ____cacheline_aligned_in_smp;
 
 	/* submission data */
@@ -382,17 +403,21 @@ struct io_ring_ctx {
 		 */
 		struct io_rsrc_node	*rsrc_node;
 		int			rsrc_cached_refs;
+		atomic_t		cancel_seq;
 		struct io_file_table	file_table;
 		unsigned		nr_user_files;
 		unsigned		nr_user_bufs;
 		struct io_mapped_ubuf	**user_bufs;
 
 		struct io_submit_state	submit_state;
+
+		struct io_buffer_list	*io_bl;
+		struct xarray		io_bl_xa;
+		struct list_head	io_buffers_cache;
+
 		struct list_head	timeout_list;
 		struct list_head	ltimeout_list;
 		struct list_head	cq_overflow_list;
-		struct list_head	*io_buffers;
-		struct list_head	io_buffers_cache;
 		struct list_head	apoll_cache;
 		struct xarray		personalities;
 		u32			pers_next;
@@ -409,9 +434,16 @@ struct io_ring_ctx {
 		struct wait_queue_head	sqo_sq_wait;
 		struct list_head	sqd_list;
 
-		unsigned long		check_cq_overflow;
+		unsigned long		check_cq;
 
 	struct {
+		/*
+		 * We cache a range of free CQEs we can use, once exhausted it
+		 * should go through a slower range setup, see __io_get_cqe()
+		 */
+		struct io_uring_cqe	*cqe_cached;
+		struct io_uring_cqe	*cqe_sentinel;
+
 		unsigned		cached_cq_tail;
 		unsigned		cq_entries;
 		struct io_ev_fd	__rcu	*io_ev_fd;
@@ -557,6 +589,8 @@ struct io_sync {
 struct io_cancel {
 	struct file			*file;
 	u64				addr;
+	u32				flags;
+	s32				fd;
 };
 
 struct io_timeout {
@@ -602,9 +636,9 @@ struct io_sr_msg {
 		void __user		*buf;
 	};
 	int				msg_flags;
-	int				bgid;
 	size_t				len;
 	size_t				done_io;
+	unsigned int			flags;
 };
 
 struct io_open {
@@ -862,6 +896,21 @@ enum {
 	IORING_RSRC_BUFFER		= 1,
 };
 
+struct io_cqe {
+	__u64	user_data;
+	__s32	res;
+	/* fd initially, then cflags for completion */
+	union {
+		__u32	flags;
+		int	fd;
+	};
+};
+
+enum {
+	IO_CHECK_CQ_OVERFLOW_BIT,
+	IO_CHECK_CQ_DROPPED_BIT,
+};
+
 /*
  * NOTE! Each of the iocb union members has the file pointer
  * as the first entry in their struct definition. So you can
@@ -902,23 +951,28 @@ struct io_kiocb {
 	u8				opcode;
 	/* polled IO has completed */
 	u8				iopoll_completed;
+	/*
+	 * Can be either a fixed buffer index, or used with provided buffers.
+	 * For the latter, before issue it points to the buffer group ID,
+	 * and after selection it points to the buffer ID itself.
+	 */
 	u16				buf_index;
 	unsigned int			flags;
 
-	u64				user_data;
-	u32				result;
-	/* fd initially, then cflags for completion */
-	union {
-		u32			cflags;
-		int			fd;
-	};
+	struct io_cqe			cqe;
 
 	struct io_ring_ctx		*ctx;
 	struct task_struct		*task;
 
-	struct percpu_ref		*fixed_rsrc_refs;
-	/* store used ubuf, so we can prevent reloading */
-	struct io_mapped_ubuf		*imu;
+	struct io_rsrc_node		*rsrc_node;
+
+	union {
+		/* store used ubuf, so we can prevent reloading */
+		struct io_mapped_ubuf	*imu;
+
+		/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
+		struct io_buffer	*kbuf;
+	};
 
 	union {
 		/* used by request caches, completion batching and iopoll */
@@ -935,8 +989,6 @@ struct io_kiocb {
 	struct async_poll		*apoll;
 	/* opcode allocated if it needs to store data for async defer */
 	void				*async_data;
-	/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
-	struct io_buffer		*kbuf;
 	/* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
 	struct io_kiocb			*link;
 	/* custom credentials, valid IFF REQ_F_CREDS is set */
@@ -956,6 +1008,16 @@ struct io_defer_entry {
 	u32			seq;
 };
 
+struct io_cancel_data {
+	struct io_ring_ctx *ctx;
+	union {
+		u64 data;
+		struct file *file;
+	};
+	u32 flags;
+	int seq;
+};
+
 struct io_op_def {
 	/* needs req->file assigned */
 	unsigned		needs_file : 1;
@@ -977,12 +1039,19 @@ struct io_op_def {
 	unsigned		not_supported : 1;
 	/* skip auditing */
 	unsigned		audit_skip : 1;
+	/* supports ioprio */
+	unsigned		ioprio : 1;
+	/* supports iopoll */
+	unsigned		iopoll : 1;
 	/* size of async data needed, if any */
 	unsigned short		async_size;
 };
 
 static const struct io_op_def io_op_defs[] = {
-	[IORING_OP_NOP] = {},
+	[IORING_OP_NOP] = {
+		.audit_skip		= 1,
+		.iopoll			= 1,
+	},
 	[IORING_OP_READV] = {
 		.needs_file		= 1,
 		.unbound_nonreg_file	= 1,
@@ -991,6 +1060,8 @@ static const struct io_op_def io_op_defs[] = {
 		.needs_async_setup	= 1,
 		.plug			= 1,
 		.audit_skip		= 1,
+		.ioprio			= 1,
+		.iopoll			= 1,
 		.async_size		= sizeof(struct io_async_rw),
 	},
 	[IORING_OP_WRITEV] = {
@@ -1001,6 +1072,8 @@ static const struct io_op_def io_op_defs[] = {
 		.needs_async_setup	= 1,
 		.plug			= 1,
 		.audit_skip		= 1,
+		.ioprio			= 1,
+		.iopoll			= 1,
 		.async_size		= sizeof(struct io_async_rw),
 	},
 	[IORING_OP_FSYNC] = {
@@ -1013,6 +1086,8 @@ static const struct io_op_def io_op_defs[] = {
 		.pollin			= 1,
 		.plug			= 1,
 		.audit_skip		= 1,
+		.ioprio			= 1,
+		.iopoll			= 1,
 		.async_size		= sizeof(struct io_async_rw),
 	},
 	[IORING_OP_WRITE_FIXED] = {
@@ -1022,6 +1097,8 @@ static const struct io_op_def io_op_defs[] = {
 		.pollout		= 1,
 		.plug			= 1,
 		.audit_skip		= 1,
+		.ioprio			= 1,
+		.iopoll			= 1,
 		.async_size		= sizeof(struct io_async_rw),
 	},
 	[IORING_OP_POLL_ADD] = {
@@ -1086,6 +1163,7 @@ static const struct io_op_def io_op_defs[] = {
 	[IORING_OP_CLOSE] = {},
 	[IORING_OP_FILES_UPDATE] = {
 		.audit_skip		= 1,
+		.iopoll			= 1,
 	},
 	[IORING_OP_STATX] = {
 		.audit_skip		= 1,
@@ -1097,6 +1175,8 @@ static const struct io_op_def io_op_defs[] = {
 		.buffer_select		= 1,
 		.plug			= 1,
 		.audit_skip		= 1,
+		.ioprio			= 1,
+		.iopoll			= 1,
 		.async_size		= sizeof(struct io_async_rw),
 	},
 	[IORING_OP_WRITE] = {
@@ -1106,6 +1186,8 @@ static const struct io_op_def io_op_defs[] = {
 		.pollout		= 1,
 		.plug			= 1,
 		.audit_skip		= 1,
+		.ioprio			= 1,
+		.iopoll			= 1,
 		.async_size		= sizeof(struct io_async_rw),
 	},
 	[IORING_OP_FADVISE] = {
@@ -1140,9 +1222,11 @@ static const struct io_op_def io_op_defs[] = {
 	},
 	[IORING_OP_PROVIDE_BUFFERS] = {
 		.audit_skip		= 1,
+		.iopoll			= 1,
 	},
 	[IORING_OP_REMOVE_BUFFERS] = {
 		.audit_skip		= 1,
+		.iopoll			= 1,
 	},
 	[IORING_OP_TEE] = {
 		.needs_file		= 1,
@@ -1160,11 +1244,13 @@ static const struct io_op_def io_op_defs[] = {
 	[IORING_OP_LINKAT] = {},
 	[IORING_OP_MSG_RING] = {
 		.needs_file		= 1,
+		.iopoll			= 1,
 	},
 };
 
 /* requests with any of those set should undergo io_disarm_next() */
 #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
+#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)
 
 static bool io_disarm_next(struct io_kiocb *req);
 static void io_uring_del_tctx_node(unsigned long index);
@@ -1173,10 +1259,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 					 bool cancel_all);
 static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
 
-static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags);
-
-static void io_put_req(struct io_kiocb *req);
-static void io_put_req_deferred(struct io_kiocb *req);
+static void __io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags);
 static void io_dismantle_req(struct io_kiocb *req);
 static void io_queue_linked_timeout(struct io_kiocb *req);
 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
@@ -1188,7 +1271,7 @@ static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
 static inline struct file *io_file_get_normal(struct io_kiocb *req, int fd);
 static void io_drop_inflight_file(struct io_kiocb *req);
 static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags);
-static void __io_queue_sqe(struct io_kiocb *req);
+static void io_queue_sqe(struct io_kiocb *req);
 static void io_rsrc_put_work(struct work_struct *work);
 
 static void io_req_task_queue(struct io_kiocb *req);
@@ -1201,6 +1284,7 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
 
 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
 static void io_eventfd_signal(struct io_ring_ctx *ctx);
+static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags);
 
 static struct kmem_cache *req_cachep;
@@ -1219,6 +1303,42 @@ struct sock *io_uring_get_socket(struct file *file)
 }
 EXPORT_SYMBOL(io_uring_get_socket);
 
+#if defined(CONFIG_UNIX)
+static inline bool io_file_need_scm(struct file *filp)
+{
+#if defined(IO_URING_SCM_ALL)
+	return true;
+#else
+	return !!unix_get_socket(filp);
+#endif
+}
+#else
+static inline bool io_file_need_scm(struct file *filp)
+{
+	return false;
+}
+#endif
+
+static void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags)
+{
+	lockdep_assert_held(&ctx->uring_lock);
+	if (issue_flags & IO_URING_F_UNLOCKED)
+		mutex_unlock(&ctx->uring_lock);
+}
+
+static void io_ring_submit_lock(struct io_ring_ctx *ctx, unsigned issue_flags)
+{
+	/*
+	 * "Normal" inline submissions always hold the uring_lock, since we
	 * grab it from the system call. Same is true for the SQPOLL offload.
+	 * The only exception is when we've detached the request and issue it
+	 * from an async worker thread, grab the lock for that case.
+	 */
+	if (issue_flags & IO_URING_F_UNLOCKED)
+		mutex_lock(&ctx->uring_lock);
+	lockdep_assert_held(&ctx->uring_lock);
+}
+
 static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
 {
 	if (!*locked) {
@@ -1280,31 +1400,36 @@ static inline void io_req_set_refcount(struct io_kiocb *req)
 
 #define IO_RSRC_REF_BATCH	100
 
+static void io_rsrc_put_node(struct io_rsrc_node *node, int nr)
+{
+	percpu_ref_put_many(&node->refs, nr);
+}
+
 static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
					  struct io_ring_ctx *ctx)
 	__must_hold(&ctx->uring_lock)
 {
-	struct percpu_ref *ref = req->fixed_rsrc_refs;
+	struct io_rsrc_node *node = req->rsrc_node;
 
-	if (ref) {
-		if (ref == &ctx->rsrc_node->refs)
+	if (node) {
+		if (node == ctx->rsrc_node)
 			ctx->rsrc_cached_refs++;
 		else
-			percpu_ref_put(ref);
+			io_rsrc_put_node(node, 1);
 	}
 }
 
-static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx)
+static inline void io_req_put_rsrc(struct io_kiocb *req)
 {
-	if (req->fixed_rsrc_refs)
-		percpu_ref_put(req->fixed_rsrc_refs);
+	if (req->rsrc_node)
+		io_rsrc_put_node(req->rsrc_node, 1);
 }
 
 static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
 	__must_hold(&ctx->uring_lock)
 {
 	if (ctx->rsrc_cached_refs) {
-		percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs);
+		io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs);
 		ctx->rsrc_cached_refs = 0;
 	}
 }
@@ -1320,8 +1445,8 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req,
 					struct io_ring_ctx *ctx,
 					unsigned int issue_flags)
 {
-	if (!req->fixed_rsrc_refs) {
-		req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
+	if (!req->rsrc_node) {
+		req->rsrc_node = ctx->rsrc_node;
 
 		if (!(issue_flags & IO_URING_F_UNLOCKED)) {
 			lockdep_assert_held(&ctx->uring_lock);
@@ -1329,21 +1454,17 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req,
 			if (unlikely(ctx->rsrc_cached_refs < 0))
 				io_rsrc_refs_refill(ctx);
 		} else {
-			percpu_ref_get(req->fixed_rsrc_refs);
+			percpu_ref_get(&req->rsrc_node->refs);
 		}
 	}
 }
 
 static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list)
 {
-	struct io_buffer *kbuf = req->kbuf;
-	unsigned int cflags;
-
-	cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT);
 	req->flags &= ~REQ_F_BUFFER_SELECTED;
-	list_add(&kbuf->list, list);
-	req->kbuf = NULL;
-	return cflags;
+	list_add(&req->kbuf->list, list);
+
+	return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
 }
 
 static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
@@ -1393,15 +1514,10 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req,
 static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
 						 unsigned int bgid)
 {
-	struct list_head *hash_list;
-	struct io_buffer_list *bl;
-
-	hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
-	list_for_each_entry(bl, hash_list, list)
-		if (bl->bgid == bgid || bgid == -1U)
-			return bl;
+	if (ctx->io_bl && bgid < BGID_ARRAY)
+		return &ctx->io_bl[bgid];
 
-	return NULL;
+	return xa_load(&ctx->io_bl_xa, bgid);
 }
 
 static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
@@ -1416,19 +1532,15 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
 	if (req->flags & REQ_F_PARTIAL_IO)
 		return;
 
-	if (issue_flags & IO_URING_F_UNLOCKED)
-		mutex_lock(&ctx->uring_lock);
-
-	lockdep_assert_held(&ctx->uring_lock);
+	io_ring_submit_lock(ctx, issue_flags);
 
 	buf = req->kbuf;
 	bl = io_buffer_get_list(ctx, buf->bgid);
 	list_add(&buf->list, &bl->buf_list);
 	req->flags &= ~REQ_F_BUFFER_SELECTED;
-	req->kbuf = NULL;
+	req->buf_index = buf->bgid;
 
-	if (issue_flags & IO_URING_F_UNLOCKED)
-		mutex_unlock(&ctx->uring_lock);
+	io_ring_submit_unlock(ctx, issue_flags);
 }
 
 static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
@@ -1469,7 +1581,12 @@ static inline void req_set_fail(struct io_kiocb *req)
 static inline void req_fail_link_node(struct io_kiocb *req, int res)
 {
 	req_set_fail(req);
-	req->result = res;
+	req->cqe.res = res;
+}
+
+static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
+{
+	wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
 }
 
 static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
@@ -1506,12 +1623,14 @@ static __cold void io_fallback_req_func(struct work_struct *work)
 static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 {
 	struct io_ring_ctx *ctx;
-	int i, hash_bits;
+	int hash_bits;
 
 	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 	if (!ctx)
 		return NULL;
 
+	xa_init(&ctx->io_bl_xa);
+
 	/*
 	 * Use 5 bits less than the max cq entries, that should give us around
 	 * 32 entries per hash list if totally full and uniformly spread.
@@ -1533,13 +1652,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	/* set invalid range, so io_import_fixed() fails meeting it */
 	ctx->dummy_ubuf->ubuf = -1UL;
 
-	ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS,
-					sizeof(struct list_head), GFP_KERNEL);
-	if (!ctx->io_buffers)
-		goto err;
-	for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++)
-		INIT_LIST_HEAD(&ctx->io_buffers[i]);
-
 	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
 			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
 		goto err;
@@ -1575,7 +1687,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 err:
 	kfree(ctx->dummy_ubuf);
 	kfree(ctx->cancel_hash);
-	kfree(ctx->io_buffers);
+	kfree(ctx->io_bl);
+	xa_destroy(&ctx->io_bl_xa);
 	kfree(ctx);
 	return NULL;
 }
@@ -1599,10 +1712,6 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
 	return false;
 }
 
-#define FFS_NOWAIT		0x1UL
-#define FFS_ISREG		0x2UL
-#define FFS_MASK		~(FFS_NOWAIT|FFS_ISREG)
-
 static inline bool io_req_ffs_set(struct io_kiocb *req)
 {
 	return req->flags & REQ_F_FIXED_FILE;
@@ -1629,6 +1738,17 @@ static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 	return __io_prep_linked_timeout(req);
 }
 
+static noinline void __io_arm_ltimeout(struct io_kiocb *req)
+{
+	io_queue_linked_timeout(__io_prep_linked_timeout(req));
+}
+
+static inline void io_arm_ltimeout(struct io_kiocb *req)
+{
+	if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
+		__io_arm_ltimeout(req);
+}
+
 static void io_prep_async_work(struct io_kiocb *req)
 {
 	const struct io_op_def *def = &io_op_defs[req->opcode];
@@ -1641,6 +1761,7 @@ static void io_prep_async_work(struct io_kiocb *req)
 
 	req->work.list.next = NULL;
 	req->work.flags = 0;
+	req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
 	if (req->flags & REQ_F_FORCE_ASYNC)
 		req->work.flags |= IO_WQ_WORK_CONCURRENT;
 
@@ -1672,17 +1793,15 @@ static void io_prep_async_link(struct io_kiocb *req)
 
 static inline void io_req_add_compl_list(struct io_kiocb *req)
 {
-	struct io_ring_ctx *ctx = req->ctx;
-	struct io_submit_state *state = &ctx->submit_state;
+	struct io_submit_state *state = &req->ctx->submit_state;
 
 	if (!(req->flags & REQ_F_CQE_SKIP))
-		ctx->submit_state.flush_cqes = true;
+		state->flush_cqes = true;
 	wq_list_add_tail(&req->comp_list, &state->compl_reqs);
 }
 
-static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
+static void io_queue_iowq(struct io_kiocb *req, bool *dont_use)
 {
-	struct io_ring_ctx *ctx = req->ctx;
 	struct io_kiocb *link = io_prep_linked_timeout(req);
 	struct io_uring_task *tctx = req->task->io_uring;
 
@@ -1702,8 +1821,9 @@ static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
 	if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
 		req->work.flags |= IO_WQ_WORK_CANCEL;
 
-	trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags,
-					&req->work, io_wq_is_hashed(&req->work));
+	trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data,
+					req->opcode, req->flags, &req->work,
+					io_wq_is_hashed(&req->work));
 	io_wq_enqueue(tctx->io_wq, &req->work);
 	if (link)
 		io_queue_linked_timeout(link);
@@ -1721,8 +1841,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status)
 		atomic_set(&req->ctx->cq_timeouts,
 			atomic_read(&req->ctx->cq_timeouts) + 1);
 		list_del_init(&req->timeout.list);
-		io_fill_cqe_req(req, status, 0);
-		io_put_req_deferred(req);
+		io_req_tw_post_queue(req, status, 0);
 	}
 }
 
@@ -1804,21 +1923,38 @@ static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
 	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
 }
 
-static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
+/*
+ * writes to the cq entry need to come after reading head; the
+ * control dependency is enough as we're using WRITE_ONCE to
+ * fill the cq entry
+ */
+static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
 {
 	struct io_rings *rings = ctx->rings;
-	unsigned tail, mask = ctx->cq_entries - 1;
-
-	/*
-	 * writes to the cq entry need to come after reading head; the
-	 * control dependency is enough as we're using WRITE_ONCE to
-	 * fill the cq entry
-	 */
-	if (__io_cqring_events(ctx) == ctx->cq_entries)
+	unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
+	unsigned int free, queued, len;
+
+	/* userspace may cheat modifying the tail, be safe and do min */
+	queued = min(__io_cqring_events(ctx), ctx->cq_entries);
+	free = ctx->cq_entries - queued;
+	/* we need a contiguous range, limit based on the current array offset */
+	len = min(free, ctx->cq_entries - off);
+	if (!len)
 		return NULL;
-	tail = ctx->cached_cq_tail++;
-	return &rings->cqes[tail & mask];
+
+	ctx->cached_cq_tail++;
+	ctx->cqe_cached = &rings->cqes[off];
+	ctx->cqe_sentinel = ctx->cqe_cached + len;
+	return ctx->cqe_cached++;
+}
+
+static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
+{
+	if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) {
+		ctx->cached_cq_tail++;
+		return ctx->cqe_cached++;
+	}
+	return __io_get_cqe(ctx);
 }
 
 static void io_eventfd_signal(struct io_ring_ctx *ctx)
@@ -1915,13 +2051,11 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 
 	all_flushed = list_empty(&ctx->cq_overflow_list);
 	if (all_flushed) {
-		clear_bit(0, &ctx->check_cq_overflow);
-		WRITE_ONCE(ctx->rings->sq_flags,
-			   ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
+		clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
+		atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
 	}
 
-	if (posted)
-		io_commit_cqring(ctx);
+	io_commit_cqring(ctx);
 	spin_unlock(&ctx->completion_lock);
 	if (posted)
 		io_cqring_ev_posted(ctx);
@@ -1932,7 +2066,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
 {
 	bool ret = true;
 
-	if (test_bit(0, &ctx->check_cq_overflow)) {
+	if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
 		/* iopoll syncs against uring_lock, not completion_lock */
 		if (ctx->flags & IORING_SETUP_IOPOLL)
 			mutex_lock(&ctx->uring_lock);
@@ -1944,19 +2078,23 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
 	return ret;
 }
 
-/* must to be called somewhat shortly after putting a request */
-static inline void io_put_task(struct task_struct *task, int nr)
+static void __io_put_task(struct task_struct *task, int nr)
 {
 	struct io_uring_task *tctx = task->io_uring;
 
-	if (likely(task == current)) {
-		tctx->cached_refs += nr;
-	} else {
-		percpu_counter_sub(&tctx->inflight, nr);
-		if (unlikely(atomic_read(&tctx->in_idle)))
-			wake_up(&tctx->wait);
-		put_task_struct_many(task, nr);
-	}
+	percpu_counter_sub(&tctx->inflight, nr);
+	if (unlikely(atomic_read(&tctx->in_idle)))
+		wake_up(&tctx->wait);
+	put_task_struct_many(task, nr);
+}
+
+/* must to be called somewhat shortly after putting a request */
+static inline void io_put_task(struct task_struct *task, int nr)
+{
+	if (likely(task == current))
+		task->io_uring->cached_refs += nr;
+	else
+		__io_put_task(task, nr);
 }
 
 static void io_task_refs_refill(struct io_uring_task *tctx)
@@ -1995,6 +2133,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
 	struct io_overflow_cqe *ocqe;
 
 	ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
+	trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
 	if (!ocqe) {
 		/*
 		 * If we're in ring overflow flush mode, or in task cancel mode,
@@ -2002,12 +2141,12 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
 		 * on the floor.
 		 */
 		io_account_cq_overflow(ctx);
+		set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
 		return false;
 	}
 	if (list_empty(&ctx->cq_overflow_list)) {
-		set_bit(0, &ctx->check_cq_overflow);
-		WRITE_ONCE(ctx->rings->sq_flags,
-			   ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
+		set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
+		atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
 	}
 	ocqe->cqe.user_data = user_data;
@@ -2037,16 +2176,32 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
 	return io_cqring_event_overflow(ctx, user_data, res, cflags);
 }
 
-static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
+static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx,
+					    struct io_kiocb *req)
 {
-	trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags);
-	return __io_fill_cqe(req->ctx, req->user_data, res, cflags);
+	struct io_uring_cqe *cqe;
+
+	trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
+				req->cqe.res, req->cqe.flags);
+
+	/*
+	 * If we can't get a cq entry, userspace overflowed the
+	 * submission (by quite a lot). Increment the overflow count in
+	 * the ring.
+	 */
+	cqe = io_get_cqe(ctx);
+	if (likely(cqe)) {
+		memcpy(cqe, &req->cqe, sizeof(*cqe));
+		return true;
+	}
+	return io_cqring_event_overflow(ctx, req->cqe.user_data,
+					req->cqe.res, req->cqe.flags);
 }
 
-static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
+static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
 {
-	if (!(req->flags & REQ_F_CQE_SKIP))
-		__io_fill_cqe_req(req, res, cflags);
+	trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags);
+	return __io_fill_cqe(req->ctx, req->cqe.user_data, res, cflags);
 }
 
 static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
@@ -2069,7 +2224,7 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res,
 	 * free_list cache.
 	 */
 	if (req_ref_put_and_test(req)) {
-		if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
+		if (req->flags & IO_REQ_LINK_FLAGS) {
 			if (req->flags & IO_DISARM_MASK)
 				io_disarm_next(req);
 			if (req->link) {
@@ -2077,7 +2232,7 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res,
 				req->link = NULL;
 			}
 		}
-		io_req_put_rsrc(req, ctx);
+		io_req_put_rsrc(req);
 		/*
 		 * Selected buffer deallocation in io_clean_op() assumes that
 		 * we don't hold ->completion_lock. Clean them here to avoid
@@ -2106,8 +2261,8 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res,
 static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
 					 u32 cflags)
 {
-	req->result = res;
-	req->cflags = cflags;
+	req->cqe.res = res;
+	req->cqe.flags = cflags;
 	req->flags |= REQ_F_COMPLETE_INLINE;
 }
 
@@ -2131,17 +2286,6 @@ static void io_req_complete_failed(struct io_kiocb *req, s32 res)
 	io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
 }
 
-static void io_req_complete_fail_submit(struct io_kiocb *req)
-{
-	/*
-	 * We don't submit, fail them all, for that replace hardlinks with
-	 * normal links. Extra REQ_F_LINK is tolerated.
-	 */
-	req->flags &= ~REQ_F_HARDLINK;
-	req->flags |= REQ_F_LINK;
-	io_req_complete_failed(req, req->result);
-}
-
 /*
 * Don't initialise the fields below on every allocation, but do that in
 * advance and keep them valid across allocations.
@@ -2152,7 +2296,7 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
 	req->link = NULL;
 	req->async_data = NULL;
 	/* not necessary, but safer to zero */
-	req->result = 0;
+	req->cqe.res = 0;
 }
 
 static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
@@ -2164,19 +2308,9 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
 	spin_unlock(&ctx->completion_lock);
 }
 
-/* Returns true IFF there are requests in the cache */
-static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
+static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
 {
-	struct io_submit_state *state = &ctx->submit_state;
-
-	/*
-	 * If we have more than a batch's worth of requests in our IRQ side
-	 * locked cache, grab the lock and move them over to our submission
-	 * side cache.
-	 */
-	if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
-		io_flush_cached_locked_reqs(ctx, state);
-	return !!state->free_list.next;
+	return !ctx->submit_state.free_list.next;
 }
 
 /*
@@ -2188,14 +2322,20 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
 static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
 	__must_hold(&ctx->uring_lock)
 {
-	struct io_submit_state *state = &ctx->submit_state;
 	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 	void *reqs[IO_REQ_ALLOC_BATCH];
-	struct io_kiocb *req;
 	int ret, i;
 
-	if (likely(state->free_list.next || io_flush_cached_reqs(ctx)))
-		return true;
+	/*
+	 * If we have more than a batch's worth of requests in our IRQ side
+	 * locked cache, grab the lock and move them over to our submission
+	 * side cache.
+	 */
+	if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) {
+		io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
+		i
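A note on the io_get_cqe()/__io_get_cqe() split in the io_uring.c hunks above: rather than computing "tail & mask" for every completion, the slow path reserves a contiguous run of free CQEs once (bounded both by the number of free entries and by the wrap point of the array), and the fast path then hands entries out pointer-by-pointer until a sentinel is reached. The standalone model below mirrors that logic outside the kernel; the struct names and the simplified head handling are illustrative only, not the kernel's actual types:

#include <assert.h>
#include <stdio.h>

#define CQ_ENTRIES 8	/* power-of-two completion ring, as in io_uring */
#define MIN(a, b) ((a) < (b) ? (a) : (b))

struct cqe { unsigned long long user_data; int res; unsigned flags; };

struct ring {
	struct cqe cqes[CQ_ENTRIES];
	unsigned head;			/* consumer index, advanced elsewhere */
	unsigned cached_tail;		/* producer index, not yet published */
	struct cqe *cqe_cached;		/* next free CQE in the reserved run */
	struct cqe *cqe_sentinel;	/* one past the end of the run */
};

/* Slow path: carve out a new contiguous range of free CQEs. */
static struct cqe *get_cqe_slow(struct ring *r)
{
	unsigned off = r->cached_tail & (CQ_ENTRIES - 1);
	unsigned queued = MIN(r->cached_tail - r->head, CQ_ENTRIES);
	unsigned free = CQ_ENTRIES - queued;
	/* the run must be contiguous, so also clamp at the array end */
	unsigned len = MIN(free, CQ_ENTRIES - off);

	if (!len)
		return NULL;	/* ring full until head moves */
	r->cached_tail++;
	r->cqe_cached = &r->cqes[off];
	r->cqe_sentinel = r->cqe_cached + len;
	return r->cqe_cached++;
}

/* Fast path: no masking, just bump a pointer within the cached run. */
static struct cqe *get_cqe(struct ring *r)
{
	if (r->cqe_cached < r->cqe_sentinel) {
		r->cached_tail++;
		return r->cqe_cached++;
	}
	return get_cqe_slow(r);
}

int main(void)
{
	struct ring r = { .head = 0 };

	for (int i = 0; i < CQ_ENTRIES; i++)
		assert(get_cqe(&r) != NULL);	/* 1 slow + 7 fast hits */
	assert(get_cqe(&r) == NULL);		/* full: head hasn't moved */
	puts("ok");
	return 0;
}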