diff options
Diffstat (limited to 'fs/io_uring.c')
| -rw-r--r-- | fs/io_uring.c | 1251 |
1 files changed, 967 insertions, 284 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c index 4715980e9015..5fa736344b67 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -63,6 +63,7 @@ #include <net/sock.h> #include <net/af_unix.h> #include <net/scm.h> +#include <net/busy_poll.h> #include <linux/anon_inodes.h> #include <linux/sched/mm.h> #include <linux/uaccess.h> @@ -263,11 +264,18 @@ struct io_rsrc_data { bool quiesce; }; +struct io_buffer_list { + struct list_head list; + struct list_head buf_list; + __u16 bgid; +}; + struct io_buffer { struct list_head list; __u64 addr; __u32 len; __u16 bid; + __u16 bgid; }; struct io_restriction { @@ -326,6 +334,14 @@ struct io_submit_state { struct blk_plug plug; }; +struct io_ev_fd { + struct eventfd_ctx *cq_ev_fd; + unsigned int eventfd_async: 1; + struct rcu_head rcu; +}; + +#define IO_BUFFERS_HASH_BITS 5 + struct io_ring_ctx { /* const or read-mostly hot data */ struct { @@ -335,11 +351,11 @@ struct io_ring_ctx { unsigned int flags; unsigned int compat: 1; unsigned int drain_next: 1; - unsigned int eventfd_async: 1; unsigned int restricted: 1; unsigned int off_timeout_used: 1; unsigned int drain_active: 1; unsigned int drain_disabled: 1; + unsigned int has_evfd: 1; } ____cacheline_aligned_in_smp; /* submission data */ @@ -378,7 +394,9 @@ struct io_ring_ctx { struct list_head timeout_list; struct list_head ltimeout_list; struct list_head cq_overflow_list; - struct xarray io_buffers; + struct list_head *io_buffers; + struct list_head io_buffers_cache; + struct list_head apoll_cache; struct xarray personalities; u32 pers_next; unsigned sq_thread_idle; @@ -395,11 +413,16 @@ struct io_ring_ctx { struct list_head sqd_list; unsigned long check_cq_overflow; +#ifdef CONFIG_NET_RX_BUSY_POLL + /* used to track busy poll napi_id */ + struct list_head napi_list; + spinlock_t napi_lock; /* napi_list lock */ +#endif struct { unsigned cached_cq_tail; unsigned cq_entries; - struct eventfd_ctx *cq_ev_fd; + struct io_ev_fd __rcu *io_ev_fd; struct wait_queue_head cq_wait; unsigned cq_extra; atomic_t cq_timeouts; @@ -421,6 +444,8 @@ struct io_ring_ctx { struct hlist_head *cancel_hash; unsigned cancel_hash_bits; bool poll_multi_queue; + + struct list_head io_buffers_comp; } ____cacheline_aligned_in_smp; struct io_restriction restrictions; @@ -436,6 +461,8 @@ struct io_ring_ctx { struct llist_head rsrc_put_llist; struct list_head rsrc_ref_list; spinlock_t rsrc_ref_lock; + + struct list_head io_buffers_pages; }; /* Keep this last, we don't need it for the fast path */ @@ -461,6 +488,11 @@ struct io_ring_ctx { }; }; +/* + * Arbitrary limit, can be raised if need be + */ +#define IO_RINGFD_REG_MAX 16 + struct io_uring_task { /* submission side */ int cached_refs; @@ -476,6 +508,7 @@ struct io_uring_task { struct io_wq_work_list task_list; struct io_wq_work_list prior_task_list; struct callback_head task_work; + struct file **registered_rings; bool task_running; }; @@ -690,6 +723,12 @@ struct io_hardlink { int flags; }; +struct io_msg { + struct file *file; + u64 user_data; + u32 len; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -741,6 +780,8 @@ enum { REQ_F_ARM_LTIMEOUT_BIT, REQ_F_ASYNC_DATA_BIT, REQ_F_SKIP_LINK_CQES_BIT, + REQ_F_SINGLE_POLL_BIT, + REQ_F_DOUBLE_POLL_BIT, /* keep async read/write and isreg together and in order */ REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, @@ -799,6 +840,10 @@ enum { REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), /* don't post CQEs while failing linked requests */ REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), + /* single poll may be active */ + REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT), + /* double poll may active */ + REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), }; struct async_poll { @@ -825,7 +870,7 @@ enum { * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can * access the file pointer through any of the sub-structs, - * or directly as just 'ki_filp' in this struct. + * or directly as just 'file' in this struct. */ struct io_kiocb { union { @@ -855,6 +900,7 @@ struct io_kiocb { struct io_mkdir mkdir; struct io_symlink symlink; struct io_hardlink hardlink; + struct io_msg msg; }; u8 opcode; @@ -877,6 +923,7 @@ struct io_kiocb { /* used by request caches, completion batching and iopoll */ struct io_wq_work_node comp_list; atomic_t refs; + atomic_t poll_refs; struct io_kiocb *link; struct io_task_work io_task_work; /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ @@ -885,12 +932,11 @@ struct io_kiocb { struct async_poll *apoll; /* opcode allocated if it needs to store data for async defer */ void *async_data; - struct io_wq_work work; /* custom credentials, valid IFF REQ_F_CREDS is set */ - const struct cred *creds; /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ struct io_buffer *kbuf; - atomic_t poll_refs; + const struct cred *creds; + struct io_wq_work work; }; struct io_tctx_node { @@ -1105,6 +1151,9 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_MKDIRAT] = {}, [IORING_OP_SYMLINKAT] = {}, [IORING_OP_LINKAT] = {}, + [IORING_OP_MSG_RING] = { + .needs_file = 1, + }, }; /* requests with any of those set should undergo io_disarm_next() */ @@ -1141,6 +1190,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); +static void io_eventfd_signal(struct io_ring_ctx *ctx); static struct kmem_cache *req_cachep; @@ -1267,36 +1317,88 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, } } -static unsigned int __io_put_kbuf(struct io_kiocb *req) +static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) { struct io_buffer *kbuf = req->kbuf; unsigned int cflags; - cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; - cflags |= IORING_CQE_F_BUFFER; + cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT); req->flags &= ~REQ_F_BUFFER_SELECTED; - kfree(kbuf); + list_add(&kbuf->list, list); req->kbuf = NULL; return cflags; } -static inline unsigned int io_put_kbuf(struct io_kiocb *req) +static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) +{ + if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + return 0; + return __io_put_kbuf(req, &req->ctx->io_buffers_comp); +} + +static inline unsigned int io_put_kbuf(struct io_kiocb *req, + unsigned issue_flags) { + unsigned int cflags; + if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) return 0; - return __io_put_kbuf(req); + + /* + * We can add this buffer back to two lists: + * + * 1) The io_buffers_cache list. This one is protected by the + * ctx->uring_lock. If we already hold this lock, add back to this + * list as we can grab it from issue as well. + * 2) The io_buffers_comp list. This one is protected by the + * ctx->completion_lock. + * + * We migrate buffers from the comp_list to the issue cache list + * when we need one. + */ + if (issue_flags & IO_URING_F_UNLOCKED) { + struct io_ring_ctx *ctx = req->ctx; + + spin_lock(&ctx->completion_lock); + cflags = __io_put_kbuf(req, &ctx->io_buffers_comp); + spin_unlock(&ctx->completion_lock); + } else { + cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache); + } + + return cflags; } -static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl) +static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, + unsigned int bgid) { - bool got = percpu_ref_tryget(ref); + struct list_head *hash_list; + struct io_buffer_list *bl; - /* already at zero, wait for ->release() */ - if (!got) - wait_for_completion(compl); - percpu_ref_resurrect(ref); - if (got) - percpu_ref_put(ref); + hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)]; + list_for_each_entry(bl, hash_list, list) + if (bl->bgid == bgid || bgid == -1U) + return bl; + + return NULL; +} + +static void io_kbuf_recycle(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + struct io_buffer *buf; + + if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + return; + + lockdep_assert_held(&ctx->uring_lock); + + buf = req->kbuf; + bl = io_buffer_get_list(ctx, buf->bgid); + list_add(&buf->list, &bl->buf_list); + req->flags &= ~REQ_F_BUFFER_SELECTED; + req->kbuf = NULL; } static bool io_match_task(struct io_kiocb *head, struct task_struct *task, @@ -1409,7 +1511,7 @@ static __cold void io_fallback_req_func(struct work_struct *work) static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; - int hash_bits; + int i, hash_bits; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -1436,6 +1538,13 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) /* set invalid range, so io_import_fixed() fails meeting it */ ctx->dummy_ubuf->ubuf = -1UL; + ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS, + sizeof(struct list_head), GFP_KERNEL); + if (!ctx->io_buffers) + goto err; + for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) + INIT_LIST_HEAD(&ctx->io_buffers[i]); + if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) goto err; @@ -1444,14 +1553,17 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->sqo_sq_wait); INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); + INIT_LIST_HEAD(&ctx->io_buffers_cache); + INIT_LIST_HEAD(&ctx->apoll_cache); init_completion(&ctx->ref_comp); - xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->cq_wait); spin_lock_init(&ctx->completion_lock); spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); + INIT_LIST_HEAD(&ctx->io_buffers_pages); + INIT_LIST_HEAD(&ctx->io_buffers_comp); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); @@ -1464,10 +1576,15 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_WQ_LIST(&ctx->locked_free_list); INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); INIT_WQ_LIST(&ctx->submit_state.compl_reqs); +#ifdef CONFIG_NET_RX_BUSY_POLL + INIT_LIST_HEAD(&ctx->napi_list); + spin_lock_init(&ctx->napi_lock); +#endif return ctx; err: kfree(ctx->dummy_ubuf); kfree(ctx->cancel_hash); + kfree(ctx->io_buffers); kfree(ctx); return NULL; } @@ -1610,8 +1727,8 @@ static void io_queue_async_work(struct io_kiocb *req, bool *dont_use) if (WARN_ON_ONCE(!same_thread_group(req->task, current))) req->work.flags |= IO_WQ_WORK_CANCEL; - trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, - &req->work, req->flags); + trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags, + &req->work, io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); if (link) io_queue_linked_timeout(link); @@ -1681,22 +1798,27 @@ static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->timeout_lock); } -static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx) -{ - if (ctx->off_timeout_used) - io_flush_timeouts(ctx); - if (ctx->drain_active) - io_queue_deferred(ctx); -} - static inline void io_commit_cqring(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || ctx->drain_active)) - __io_commit_cqring_flush(ctx); /* order cqe stores with ring update */ smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); } +static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) +{ + if (ctx->off_timeout_used || ctx->drain_active) { + spin_lock(&ctx->completion_lock); + if (ctx->off_timeout_used) + io_flush_timeouts(ctx); + if (ctx->drain_active) + io_queue_deferred(ctx); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + } + if (ctx->has_evfd) + io_eventfd_signal(ctx); +} + static inline bool io_sqring_full(struct io_ring_ctx *ctx) { struct io_rings *r = ctx->rings; @@ -1726,23 +1848,34 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) return &rings->cqes[tail & mask]; } -static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) +static void io_eventfd_signal(struct io_ring_ctx *ctx) { - if (likely(!ctx->cq_ev_fd)) - return false; + struct io_ev_fd *ev_fd; + + rcu_read_lock(); + /* + * rcu_dereference ctx->io_ev_fd once and use it for both for checking + * and eventfd_signal + */ + ev_fd = rcu_dereference(ctx->io_ev_fd); + + /* + * Check again if ev_fd exists incase an io_eventfd_unregister call + * completed between the NULL check of ctx->io_ev_fd at the start of + * the function and rcu_read_lock. + */ + if (unlikely(!ev_fd)) + goto out; if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) - return false; - return !ctx->eventfd_async || io_wq_current_is_worker(); + goto out; + + if (!ev_fd->eventfd_async || io_wq_current_is_worker()) + eventfd_signal(ev_fd->cq_ev_fd, 1); +out: + rcu_read_unlock(); } -/* - * This should only get called when at least one event has been posted. - * Some applications rely on the eventfd notification count only changing - * IFF a new CQE has been added to the CQ ring. There's no depedency on - * 1:1 relationship between how many times this function is called (and - * hence the eventfd count) and number of CQEs posted to the CQ ring. - */ -static void io_cqring_ev_posted(struct io_ring_ctx *ctx) +static inline void io_cqring_wake(struct io_ring_ctx *ctx) { /* * wake_up_all() may seem excessive, but io_wake_function() and @@ -1751,21 +1884,32 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) */ if (wq_has_sleeper(&ctx->cq_wait)) wake_up_all(&ctx->cq_wait); - if (io_should_trigger_evfd(ctx)) - eventfd_signal(ctx->cq_ev_fd, 1); +} + +/* + * This should only get called when at least one event has been posted. + * Some applications rely on the eventfd notification count only changing + * IFF a new CQE has been added to the CQ ring. There's no depedency on + * 1:1 relationship between how many times this function is called (and + * hence the eventfd count) and number of CQEs posted to the CQ ring. + */ +static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx) +{ + if (unlikely(ctx->off_timeout_used || ctx->drain_active || + ctx->has_evfd)) + __io_commit_cqring_flush(ctx); + + io_cqring_wake(ctx); } static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) { - /* see waitqueue_active() comment */ - smp_mb(); + if (unlikely(ctx->off_timeout_used || ctx->drain_active || + ctx->has_evfd)) + __io_commit_cqring_flush(ctx); - if (ctx->flags & IORING_SETUP_SQPOLL) { - if (waitqueue_active(&ctx->cq_wait)) - wake_up_all(&ctx->cq_wait); - } - if (io_should_trigger_evfd(ctx)) - eventfd_signal(ctx->cq_ev_fd, 1); + if (ctx->flags & IORING_SETUP_SQPOLL) + io_cqring_wake(ctx); } /* Returns true if there are no backlogged entries after the flush */ @@ -1905,8 +2049,6 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, { struct io_uring_cqe *cqe; - trace_io_uring_complete(ctx, user_data, res, cflags); - /* * If we can't get a cq entry, userspace overflowed the * submission (by quite a lot). Increment the overflow count in @@ -1922,16 +2064,23 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, return io_cqring_event_overflow(ctx, user_data, res, cflags); } +static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) +{ + trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags); + return __io_fill_cqe(req->ctx, req->user_data, res, cflags); +} + static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) { if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe(req->ctx, req->user_data, res, cflags); + __io_fill_cqe_req(req, res, cflags); } static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { ctx->cq_extra++; + trace_io_uring_complete(ctx, NULL, user_data, res, cflags); return __io_fill_cqe(ctx, user_data, res, cflags); } @@ -1941,7 +2090,7 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res, struct io_ring_ctx *ctx = req->ctx; if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe(ctx, req->user_data, res, cflags); + __io_fill_cqe_req(req, res, cflags); /* * If we're the last reference to this request, add to our locked * free_list cache. @@ -2000,7 +2149,7 @@ static inline void io_req_complete(struct io_kiocb *req, s32 res) static void io_req_complete_failed(struct io_kiocb *req, s32 res) { req_set_fail(req); - io_req_complete_post(req, res, 0); + io_req_complete_post(req, res, io_put_kbuf(req, 0)); } static void io_req_complete_fail_submit(struct io_kiocb *req) @@ -2183,7 +2332,9 @@ static void io_fail_links(struct io_kiocb *req) nxt = link->link; link->link = NULL; - trace_io_uring_fail_link(req, link); + trace_io_uring_fail_link(req->ctx, req, req->user_data, + req->opcode, link); + if (!ignore_cqes) { link->flags &= ~REQ_F_CQE_SKIP; io_fill_cqe_req(link, res, 0); @@ -2302,7 +2453,8 @@ static void handle_prev_tw_list(struct io_wq_work_node *node, if (likely(*uring_locked)) req->io_task_work.func(req, uring_locked); else - __io_req_complete_post(req, req->result, io_put_kbuf(req)); + __io_req_complete_post(req, req->result, + io_put_kbuf_comp(req)); node = next; } while (node); @@ -2530,8 +2682,16 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) comp_list); if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe(ctx, req->user_data, req->result, - req->cflags); + __io_fill_cqe_req(req, req->result, req->cflags); + if ((req->flags & REQ_F_POLLED) && req->apoll) { + struct async_poll *apoll = req->apoll; + + if (apoll->double_poll) + kfree(apoll->double_poll); + list_add(&apoll->poll.wait.entry, + &ctx->apoll_cache); + req->flags &= ~REQ_F_POLLED; + } } io_commit_cqring(ctx); @@ -2653,7 +2813,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) if (unlikely(req->flags & REQ_F_CQE_SKIP)) continue; - __io_fill_cqe(ctx, req->user_data, req->result, io_put_kbuf(req)); + __io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0)); nr_events++; } @@ -2829,14 +2989,14 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) static inline void io_req_task_complete(struct io_kiocb *req, bool *locked) { - unsigned int cflags = io_put_kbuf(req); int res = req->result; if (*locked) { - io_req_complete_state(req, res, cflags); + io_req_complete_state(req, res, io_put_kbuf(req, 0)); io_req_add_compl_list(req); } else { - io_req_complete_post(req, res, cflags); + io_req_complete_post(req, res, + io_put_kbuf(req, IO_URING_F_UNLOCKED)); } } @@ -2845,7 +3005,8 @@ static void __io_complete_rw(struct io_kiocb *req, long res, { if (__io_complete_rw_common(req, res)) return; - __io_req_complete(req, issue_flags, req->result, io_put_kbuf(req)); + __io_req_complete(req, issue_flags, req->result, + io_put_kbuf(req, issue_flags)); } static void io_complete_rw(struct kiocb *kiocb, long res) @@ -3000,14 +3161,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; kiocb->ki_pos = READ_ONCE(sqe->off); - if (kiocb->ki_pos == -1) { - if (!(file->f_mode & FMODE_STREAM)) { - req->flags |= REQ_F_CUR_POS; - kiocb->ki_pos = file->f_pos; - } else { - kiocb->ki_pos = 0; - } - } kiocb->ki_flags = iocb_flags(file); ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); if (unlikely(ret)) @@ -3074,6 +3227,24 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) } } +static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) +{ + struct kiocb *kiocb = &req->rw.kiocb; + bool is_stream = req->file->f_mode & FMODE_STREAM; + + if (kiocb->ki_pos == -1) { + if (!is_stream) { + req->flags |= REQ_F_CUR_POS; + kiocb->ki_pos = req->file->f_pos; + return &kiocb->ki_pos; + } else { + kiocb->ki_pos = 0; + return NULL; + } + } + return is_stream ? NULL : &kiocb->ki_pos; +} + static void kiocb_done(struct io_kiocb *req, ssize_t ret, unsigned int issue_flags) { @@ -3096,14 +3267,10 @@ static void kiocb_done(struct io_kiocb *req, ssize_t ret, if (req->flags & REQ_F_REISSUE) { req->flags &= ~REQ_F_REISSUE; - if (io_resubmit_prep(req)) { + if (io_resubmit_prep(req)) io_req_task_queue_reissue(req); - } else { - req_set_fail(req); - req->result = ret; - req->io_task_work.func = io_req_task_complete; - io_req_task_work_add(req, false); - } + else + io_req_task_queue_fail(req, ret); } } @@ -3201,30 +3368,36 @@ static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) mutex_lock(&ctx->uring_lock); } +static void io_buffer_add_list(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, unsigned int bgid) +{ + struct list_head *list; + + list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)]; + INIT_LIST_HEAD(&bl->buf_list); + bl->bgid = bgid; + list_add(&bl->list, list); +} + static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, int bgid, unsigned int issue_flags) { struct io_buffer *kbuf = req->kbuf; - struct io_buffer *head; bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; if (req->flags & REQ_F_BUFFER_SELECTED) return kbuf; - io_ring_submit_lock(req->ctx, needs_lock); + io_ring_submit_lock(ctx, needs_lock); - lockdep_assert_held(&req->ctx->uring_lock); + lockdep_assert_held(&ctx->uring_lock); - head = xa_load(&req->ctx->io_buffers, bgid); - if (head) { - if (!list_empty(&head->list)) { - kbuf = list_last_entry(&head->list, struct io_buffer, - list); - list_del(&kbuf->list); - } else { - kbuf = head; - xa_erase(&req->ctx->io_buffers, bgid); - } + bl = io_buffer_get_list(ctx, bgid); + if (bl && !list_empty(&bl->buf_list)) { + kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); + list_del(&kbuf->list); if (*len > kbuf->len) *len = kbuf->len; req->flags |= REQ_F_BUFFER_SELECTED; @@ -3400,6 +3573,7 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) struct kiocb *kiocb = &req->rw.kiocb; struct file *file = req->file; ssize_t ret = 0; + loff_t *ppos; /* * Don't support polled IO through this interface, and we can't @@ -3412,6 +3586,8 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) !(kiocb->ki_filp->f_flags & O_NONBLOCK)) return -EAGAIN; + ppos = io_kiocb_ppos(kiocb); + while (iov_iter_count(iter)) { struct iovec iovec; ssize_t nr; @@ -3425,10 +3601,10 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) if (rw == READ) { nr = file->f_op->read(file, iovec.iov_base, - iovec.iov_len, io_kiocb_ppos(kiocb)); + iovec.iov_len, ppos); } else { nr = file->f_op->write(file, iovec.iov_base, - iovec.iov_len, io_kiocb_ppos(kiocb)); + iovec.iov_len, ppos); } if (nr < 0) { @@ -3436,13 +3612,15 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) ret = nr; break; } + ret += nr; if (!iov_iter_is_bvec(iter)) { iov_iter_advance(iter, nr); } else { - req->rw.len -= nr; req->rw.addr += nr; + req->rw.len -= nr; + if (!req->rw.len) + break; } - ret += nr; if (nr != iovec.iov_len) break; } @@ -3629,12 +3807,23 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; struct io_async_rw *rw; ssize_t ret, ret2; + loff_t *ppos; if (!req_has_async_data(req)) { ret = io_import_iovec(READ, req, &iovec, s, issue_flags); if (unlikely(ret < 0)) return ret; } else { + /* + * Safe and required to re-import if we're using provided + * buffers, as we dropped the selected one before retry. + */ + if (req->flags & REQ_F_BUFFER_SELECT) { + ret = io_import_iovec(READ, req, &iovec, s, issue_flags); + if (unlikely(ret < 0)) + return ret; + } + rw = req->async_data; s = &rw->s; /* @@ -3659,7 +3848,9 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) kiocb->ki_flags &= ~IOCB_NOWAIT; } - ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result); + ppos = io_kiocb_update_pos(req); + + ret = rw_verify_area(READ, req->file, ppos, req->result); if (unlikely(ret)) { kfree(iovec); return ret; @@ -3669,6 +3860,9 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { req->flags &= ~REQ_F_REISSUE; + /* if we can poll, just do that */ + if (req->opcode == IORING_OP_READ && file_can_poll(req->file)) + return -EAGAIN; /* IOPOLL retry should happen for io-wq threads */ if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) goto done; @@ -3758,6 +3952,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) struct kiocb *kiocb = &req->rw.kiocb; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; ssize_t ret, ret2; + loff_t *ppos; if (!req_has_async_data(req)) { ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags); @@ -3788,7 +3983,9 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) kiocb->ki_flags &= ~IOCB_NOWAIT; } - ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result); + ppos = io_kiocb_update_pos(req); + + ret = rw_verify_area(WRITE, req->file, ppos, req->result); if (unlikely(ret)) goto out_free; @@ -4235,6 +4432,45 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static int io_msg_ring_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags || + sqe->splice_fd_in || sqe->buf_index || sqe->personality)) + return -EINVAL; + + if (req->file->f_op != &io_uring_fops) + return -EBADFD; + + req->msg.user_data = READ_ONCE(sqe->off); + req->msg.len = READ_ONCE(sqe->len); + return 0; +} + +static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_ring_ctx *target_ctx; + struct io_msg *msg = &req->msg; + int ret = -EOVERFLOW; + bool filled; + + target_ctx = req->file->private_data; + + spin_lock(&target_ctx->completion_lock); + filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, + IORING_CQE_F_MSG); + io_commit_cqring(target_ctx); + spin_unlock(&target_ctx->completion_lock); + + if (filled) { + io_cqring_ev_posted(target_ctx); + ret = 0; + } + + __io_req_complete(req, issue_flags, ret, 0); + return 0; +} + static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; @@ -4458,8 +4694,8 @@ static int io_remove_buffers_prep(struct io_kiocb *req, return 0; } -static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, - int bgid, unsigned nbufs) +static int __io_remove_buffers(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, unsigned nbufs) { unsigned i = 0; @@ -4468,19 +4704,16 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, return 0; /* the head kbuf is the list itself */ - while (!list_empty(&buf->list)) { + while (!list_empty(&bl->buf_list)) { struct io_buffer *nxt; - nxt = list_first_entry(&buf->list, struct io_buffer, list); + nxt = list_first_entry(&bl->buf_list, struct io_buffer, list); list_del(&nxt->list); - kfree(nxt); if (++i == nbufs) return i; cond_resched(); } i++; - kfree(buf); - xa_erase(&ctx->io_buffers, bgid); return i; } @@ -4489,7 +4722,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) { struct io_provide_buf *p = &req->pbuf; struct io_ring_ctx *ctx = req->ctx; - struct io_buffer *head; + struct io_buffer_list *bl; int ret = 0; bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; @@ -4498,9 +4731,9 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) lockdep_assert_held(&ctx->uring_lock); ret = -ENOENT; - head = xa_load(&ctx->io_buffers, p->bgid); - if (head) - ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); + bl = io_buffer_get_list(ctx, p->bgid); + if (bl) + ret = __io_remove_buffers(ctx, bl, p->nbufs); if (ret < 0) req_set_fail(req); @@ -4545,39 +4778,80 @@ static int io_provide_buffers_prep(struct io_kiocb *req, return 0; } -static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) +static int io_refill_buffer_cache(struct io_ring_ctx *ctx) +{ + struct io_buffer *buf; + struct page *page; + int bufs_in_page; + + /* + * Completions that don't happen inline (eg not under uring_lock) will + * add to ->io_buffers_comp. If we don't have any free buffers, check + * the completion list and splice those entries first. + */ + if (!list_empty_careful(&ctx->io_buffers_comp)) { + spin_lock(&ctx->completion_lock); + if (!list_empty(&ctx->io_buffers_comp)) { + list_splice_init(&ctx->io_buffers_comp, + &ctx->io_buffers_cache); + spin_unlock(&ctx->completion_lock); + return 0; + } + spin_unlock(&ctx->completion_lock); + } + + /* + * No free buffers and no completion entries either. Allocate a new + * page worth of buffer entries and add those to our freelist. + */ + page = alloc_page(GFP_KERNEL_ACCOUNT); + if (!page) + return -ENOMEM; + + list_add(&page->lru, &ctx->io_buffers_pages); + + buf = page_address(page); + bufs_in_page = PAGE_SIZE / sizeof(*buf); + while (bufs_in_page) { + list_add_tail(&buf->list, &ctx->io_buffers_cache); + buf++; + bufs_in_page--; + } + + return 0; +} + +static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, + struct io_buffer_list *bl) { struct io_buffer *buf; u64 addr = pbuf->addr; int i, bid = pbuf->bid; for (i = 0; i < pbuf->nbufs; i++) { - buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); - if (!buf) + if (list_empty(&ctx->io_buffers_cache) && + io_refill_buffer_cache(ctx)) break; - + buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer, + list); + list_move_tail(&buf->list, &bl->buf_list); buf->addr = addr; buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); buf->bid = bid; + buf->bgid = pbuf->bgid; addr += pbuf->len; bid++; - if (!*head) { - INIT_LIST_HEAD(&buf->list); - *head = buf; - } else { - list_add_tail(&buf->list, &(*head)->list); - } cond_resched(); } - return i ? i : -ENOMEM; + return i ? 0 : -ENOMEM; } static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) { struct io_provide_buf *p = &req->pbuf; struct io_ring_ctx *ctx = req->ctx; - struct io_buffer *head, *list; + struct io_buffer_list *bl; int ret = 0; bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; @@ -4585,14 +4859,18 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) lockdep_assert_held(&ctx->uring_lock); - list = head = xa_load(&ctx->io_buffers, p->bgid); - - ret = io_add_buffers(p, &head); - if (ret >= 0 && !list) { - ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL); - if (ret < 0) - __io_remove_buffers(ctx, head, p->bgid, -1U); + bl = io_buffer_get_list(ctx, p->bgid); + if (unlikely(!bl)) { + bl = kmalloc(sizeof(*bl), GFP_KERNEL); + if (!bl) { + ret = -ENOMEM; + goto err; + } + io_buffer_add_list(ctx, bl, p->bgid); } + + ret = io_add_buffers(ctx, p, bl); +err: if (ret < 0) req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ @@ -5184,7 +5462,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (kmsg->free_iov) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; - __io_req_complete(req, issue_flags, ret, io_put_kbuf(req)); + __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags)); return 0; } @@ -5239,7 +5517,8 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) out_free: req_set_fail(req); } - __io_req_complete(req, issue_flags, ret, io_put_kbuf(req)); + + __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags)); |
