diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-03-21 16:24:45 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-03-21 16:24:45 -0700 |
| commit | af472a9efdf65cbb3398cb6478ec0e89fbc84109 (patch) | |
| tree | 11ec956e35851d6b579cbab5c555f70598adc52c | |
| parent | 93e220a62da36f766b3188e76e234607e41488f9 (diff) | |
| parent | 5e929367468c8f97cd1ffb0417316cecfebef94b (diff) | |
| download | linux-af472a9efdf65cbb3398cb6478ec0e89fbc84109.tar.gz linux-af472a9efdf65cbb3398cb6478ec0e89fbc84109.tar.bz2 linux-af472a9efdf65cbb3398cb6478ec0e89fbc84109.zip | |
Merge tag 'for-5.18/io_uring-2022-03-18' of git://git.kernel.dk/linux-block
Pull io_uring updates from Jens Axboe:
- Fixes for current file position. Still doesn't have the f_pos_lock
sorted, but it's a step in the right direction (Dylan)
- Tracing updates (Dylan, Stefan)
- Improvements to io-wq locking (Hao)
- Improvements for provided buffers (me, Pavel)
- Support for registered file descriptors (me, Xiaoguang)
- Support for ring messages (me)
- Poll improvements (me)
- Fix for fixed buffers and non-iterator reads/writes (me)
- Support for NAPI on sockets (Olivier)
- Ring quiesce improvements (Usama)
- Misc fixes (Olivier, Pavel)
* tag 'for-5.18/io_uring-2022-03-18' of git://git.kernel.dk/linux-block: (42 commits)
io_uring: terminate manual loop iterator loop correctly for non-vecs
io_uring: don't check unrelated req->open.how in accept request
io_uring: manage provided buffers strictly ordered
io_uring: fold evfd signalling under a slower path
io_uring: thin down io_commit_cqring()
io_uring: shuffle io_eventfd_signal() bits around
io_uring: remove extra barrier for non-sqpoll iopoll
io_uring: fix provided buffer return on failure for kiocb_done()
io_uring: extend provided buf return to fails
io_uring: refactor timeout cancellation cqe posting
io_uring: normilise naming for fill_cqe*
io_uring: cache poll/double-poll state with a request flag
io_uring: cache req->apoll->events in req->cflags
io_uring: move req->poll_refs into previous struct hole
io_uring: make tracing format consistent
io_uring: recycle apoll_poll entries
io_uring: remove duplicated member check for io_msg_ring_prep()
io_uring: allow submissions to continue on error
io_uring: recycle provided buffers if request goes async
io_uring: ensure reads re-import for selected buffers
...
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | fs/io-wq.c | 114 |
| -rw-r--r-- | fs/io_uring.c | 1251 |
| -rw-r--r-- | include/linux/io_uring.h | 5 |
| -rw-r--r-- | include/trace/events/io_uring.h | 333 |
| -rw-r--r-- | include/uapi/linux/io_uring.h | 17 |
5 files changed, 1200 insertions, 520 deletions
diff --git a/fs/io-wq.c b/fs/io-wq.c index bb7f161bb19c..5b93fa67d346 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -76,6 +76,7 @@ struct io_wqe_acct { unsigned max_workers; int index; atomic_t nr_running; + raw_spinlock_t lock; struct io_wq_work_list work_list; unsigned long flags; }; @@ -91,7 +92,7 @@ enum { */ struct io_wqe { raw_spinlock_t lock; - struct io_wqe_acct acct[2]; + struct io_wqe_acct acct[IO_WQ_ACCT_NR]; int node; @@ -224,12 +225,12 @@ static void io_worker_exit(struct io_worker *worker) if (worker->flags & IO_WORKER_F_FREE) hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); - preempt_disable(); + raw_spin_unlock(&wqe->lock); io_wqe_dec_running(worker); worker->flags = 0; + preempt_disable(); current->flags &= ~PF_IO_WORKER; preempt_enable(); - raw_spin_unlock(&wqe->lock); kfree_rcu(worker, rcu); io_worker_ref_put(wqe->wq); @@ -238,10 +239,15 @@ static void io_worker_exit(struct io_worker *worker) static inline bool io_acct_run_queue(struct io_wqe_acct *acct) { + bool ret = false; + + raw_spin_lock(&acct->lock); if (!wq_list_empty(&acct->work_list) && !test_bit(IO_ACCT_STALLED_BIT, &acct->flags)) - return true; - return false; + ret = true; + raw_spin_unlock(&acct->lock); + + return ret; } /* @@ -385,7 +391,6 @@ fail: } static void io_wqe_dec_running(struct io_worker *worker) - __must_hold(wqe->lock) { struct io_wqe_acct *acct = io_wqe_get_acct(worker); struct io_wqe *wqe = worker->wqe; @@ -393,13 +398,14 @@ static void io_wqe_dec_running(struct io_worker *worker) if (!(worker->flags & IO_WORKER_F_UP)) return; - if (atomic_dec_and_test(&acct->nr_running) && io_acct_run_queue(acct)) { - atomic_inc(&acct->nr_running); - atomic_inc(&wqe->wq->worker_refs); - raw_spin_unlock(&wqe->lock); - io_queue_worker_create(worker, acct, create_worker_cb); - raw_spin_lock(&wqe->lock); - } + if (!atomic_dec_and_test(&acct->nr_running)) + return; + if (!io_acct_run_queue(acct)) + return; + + atomic_inc(&acct->nr_running); + 
atomic_inc(&wqe->wq->worker_refs); + io_queue_worker_create(worker, acct, create_worker_cb); } /* @@ -407,11 +413,12 @@ static void io_wqe_dec_running(struct io_worker *worker) * it's currently on the freelist */ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker) - __must_hold(wqe->lock) { if (worker->flags & IO_WORKER_F_FREE) { worker->flags &= ~IO_WORKER_F_FREE; + raw_spin_lock(&wqe->lock); hlist_nulls_del_init_rcu(&worker->nulls_node); + raw_spin_unlock(&wqe->lock); } } @@ -456,7 +463,7 @@ static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash) static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct, struct io_worker *worker) - __must_hold(wqe->lock) + __must_hold(acct->lock) { struct io_wq_work_node *node, *prev; struct io_wq_work *work, *tail; @@ -498,9 +505,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct, * work being added and clearing the stalled bit. */ set_bit(IO_ACCT_STALLED_BIT, &acct->flags); - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&acct->lock); unstalled = io_wait_on_hash(wqe, stall_hash); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&acct->lock); if (unstalled) { clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); if (wq_has_sleeper(&wqe->wq->hash->wait)) @@ -538,7 +545,6 @@ static void io_assign_current_work(struct io_worker *worker, static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); static void io_worker_handle_work(struct io_worker *worker) - __releases(wqe->lock) { struct io_wqe_acct *acct = io_wqe_get_acct(worker); struct io_wqe *wqe = worker->wqe; @@ -555,7 +561,9 @@ static void io_worker_handle_work(struct io_worker *worker) * can't make progress, any work completion or insertion will * clear the stalled flag. 
*/ + raw_spin_lock(&acct->lock); work = io_get_next_work(acct, worker); + raw_spin_unlock(&acct->lock); if (work) { __io_worker_busy(wqe, worker); @@ -569,10 +577,9 @@ static void io_worker_handle_work(struct io_worker *worker) raw_spin_lock(&worker->lock); worker->next_work = work; raw_spin_unlock(&worker->lock); - } - raw_spin_unlock(&wqe->lock); - if (!work) + } else { break; + } io_assign_current_work(worker, work); __set_current_state(TASK_RUNNING); @@ -608,8 +615,6 @@ static void io_worker_handle_work(struct io_worker *worker) wake_up(&wq->hash->wait); } } while (work); - - raw_spin_lock(&wqe->lock); } while (1); } @@ -633,12 +638,10 @@ static int io_wqe_worker(void *data) long ret; set_current_state(TASK_INTERRUPTIBLE); -loop: - raw_spin_lock(&wqe->lock); - if (io_acct_run_queue(acct)) { + while (io_acct_run_queue(acct)) io_worker_handle_work(worker); - goto loop; - } + + raw_spin_lock(&wqe->lock); /* timed out, exit unless we're the last worker */ if (last_timeout && acct->nr_workers > 1) { acct->nr_workers--; @@ -662,10 +665,8 @@ loop: last_timeout = !ret; } - if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { - raw_spin_lock(&wqe->lock); + if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) io_worker_handle_work(worker); - } audit_free(current); io_worker_exit(worker); @@ -705,10 +706,7 @@ void io_wq_worker_sleeping(struct task_struct *tsk) return; worker->flags &= ~IO_WORKER_F_RUNNING; - - raw_spin_lock(&worker->wqe->lock); io_wqe_dec_running(worker); - raw_spin_unlock(&worker->wqe->lock); } static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker, @@ -778,10 +776,12 @@ static void create_worker_cont(struct callback_head *cb) .cancel_all = true, }; + raw_spin_unlock(&wqe->lock); while (io_acct_cancel_pending_work(wqe, acct, &match)) - raw_spin_lock(&wqe->lock); + ; + } else { + raw_spin_unlock(&wqe->lock); } - raw_spin_unlock(&wqe->lock); io_worker_ref_put(wqe->wq); kfree(worker); return; @@ -914,6 +914,7 @@ static bool io_wq_work_match_item(struct 
io_wq_work *work, void *data) static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); + struct io_cb_cancel_data match; unsigned work_flags = work->flags; bool do_create; @@ -927,10 +928,12 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) return; } - raw_spin_lock(&wqe->lock); + raw_spin_lock(&acct->lock); io_wqe_insert_work(wqe, work); clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); + raw_spin_unlock(&acct->lock); + raw_spin_lock(&wqe->lock); rcu_read_lock(); do_create = !io_wqe_activate_free_worker(wqe, acct); rcu_read_unlock(); @@ -946,18 +949,18 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) return; raw_spin_lock(&wqe->lock); - /* fatal condition, failed to create the first worker */ - if (!acct->nr_workers) { - struct io_cb_cancel_data match = { - .fn = io_wq_work_match_item, - .data = work, - .cancel_all = false, - }; - - if (io_acct_cancel_pending_work(wqe, acct, &match)) - raw_spin_lock(&wqe->lock); + if (acct->nr_workers) { + raw_spin_unlock(&wqe->lock); + return; } raw_spin_unlock(&wqe->lock); + + /* fatal condition, failed to create the first worker */ + match.fn = io_wq_work_match_item, + match.data = work, + match.cancel_all = false, + + io_acct_cancel_pending_work(wqe, acct, &match); } } @@ -1032,22 +1035,23 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe, static bool io_acct_cancel_pending_work(struct io_wqe *wqe, struct io_wqe_acct *acct, struct io_cb_cancel_data *match) - __releases(wqe->lock) { struct io_wq_work_node *node, *prev; struct io_wq_work *work; + raw_spin_lock(&acct->lock); wq_list_for_each(node, prev, &acct->work_list) { work = container_of(node, struct io_wq_work, list); if (!match->fn(work, match->data)) continue; io_wqe_remove_pending(wqe, work, prev); - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&acct->lock); io_run_cancel(work, wqe); match->nr_pending++; /* not safe to continue after 
unlock */ return true; } + raw_spin_unlock(&acct->lock); return false; } @@ -1061,7 +1065,6 @@ retry: struct io_wqe_acct *acct = io_get_acct(wqe, i == 0); if (io_acct_cancel_pending_work(wqe, acct, match)) { - raw_spin_lock(&wqe->lock); if (match->cancel_all) goto retry; break; @@ -1103,13 +1106,11 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; - raw_spin_lock(&wqe->lock); io_wqe_cancel_pending_work(wqe, &match); - if (match.nr_pending && !match.cancel_all) { - raw_spin_unlock(&wqe->lock); + if (match.nr_pending && !match.cancel_all) return IO_WQ_CANCEL_OK; - } + raw_spin_lock(&wqe->lock); io_wqe_cancel_running_work(wqe, &match); raw_spin_unlock(&wqe->lock); if (match.nr_running && !match.cancel_all) @@ -1190,6 +1191,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) acct->index = i; atomic_set(&acct->nr_running, 0); INIT_WQ_LIST(&acct->work_list); + raw_spin_lock_init(&acct->lock); } wqe->wq = wq; raw_spin_lock_init(&wqe->lock); @@ -1282,9 +1284,7 @@ static void io_wq_destroy(struct io_wq *wq) .fn = io_wq_work_match_all, .cancel_all = true, }; - raw_spin_lock(&wqe->lock); io_wqe_cancel_pending_work(wqe, &match); - raw_spin_unlock(&wqe->lock); free_cpumask_var(wqe->cpu_mask); kfree(wqe); } @@ -1376,7 +1376,7 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count) BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND); BUILD_BUG_ON((int) IO_WQ_ACCT_NR != 2); - for (i = 0; i < 2; i++) { + for (i = 0; i < IO_WQ_ACCT_NR; i++) { if (new_count[i] > task_rlimit(current, RLIMIT_NPROC)) new_count[i] = task_rlimit(current, RLIMIT_NPROC); } diff --git a/fs/io_uring.c b/fs/io_uring.c index 4715980e9015..5fa736344b67 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -63,6 +63,7 @@ #include <net/sock.h> #include <net/af_unix.h> #include <net/scm.h> +#include <net/busy_poll.h> #include <linux/anon_inodes.h> #include <linux/sched/mm.h> #include 
<linux/uaccess.h> @@ -263,11 +264,18 @@ struct io_rsrc_data { bool quiesce; }; +struct io_buffer_list { + struct list_head list; + struct list_head buf_list; + __u16 bgid; +}; + struct io_buffer { struct list_head list; __u64 addr; __u32 len; __u16 bid; + __u16 bgid; }; struct io_restriction { @@ -326,6 +334,14 @@ struct io_submit_state { struct blk_plug plug; }; +struct io_ev_fd { + struct eventfd_ctx *cq_ev_fd; + unsigned int eventfd_async: 1; + struct rcu_head rcu; +}; + +#define IO_BUFFERS_HASH_BITS 5 + struct io_ring_ctx { /* const or read-mostly hot data */ struct { @@ -335,11 +351,11 @@ struct io_ring_ctx { unsigned int flags; unsigned int compat: 1; unsigned int drain_next: 1; - unsigned int eventfd_async: 1; unsigned int restricted: 1; unsigned int off_timeout_used: 1; unsigned int drain_active: 1; unsigned int drain_disabled: 1; + unsigned int has_evfd: 1; } ____cacheline_aligned_in_smp; /* submission data */ @@ -378,7 +394,9 @@ struct io_ring_ctx { struct list_head timeout_list; struct list_head ltimeout_list; struct list_head cq_overflow_list; - struct xarray io_buffers; + struct list_head *io_buffers; + struct list_head io_buffers_cache; + struct list_head apoll_cache; struct xarray personalities; u32 pers_next; unsigned sq_thread_idle; @@ -395,11 +413,16 @@ struct io_ring_ctx { struct list_head sqd_list; unsigned long check_cq_overflow; +#ifdef CONFIG_NET_RX_BUSY_POLL + /* used to track busy poll napi_id */ + struct list_head napi_list; + spinlock_t napi_lock; /* napi_list lock */ +#endif struct { unsigned cached_cq_tail; unsigned cq_entries; - struct eventfd_ctx *cq_ev_fd; + struct io_ev_fd __rcu *io_ev_fd; struct wait_queue_head cq_wait; unsigned cq_extra; atomic_t cq_timeouts; @@ -421,6 +444,8 @@ struct io_ring_ctx { struct hlist_head *cancel_hash; unsigned cancel_hash_bits; bool poll_multi_queue; + + struct list_head io_buffers_comp; } ____cacheline_aligned_in_smp; struct io_restriction restrictions; @@ -436,6 +461,8 @@ struct io_ring_ctx { struct 
llist_head rsrc_put_llist; struct list_head rsrc_ref_list; spinlock_t rsrc_ref_lock; + + struct list_head io_buffers_pages; }; /* Keep this last, we don't need it for the fast path */ @@ -461,6 +488,11 @@ struct io_ring_ctx { }; }; +/* + * Arbitrary limit, can be raised if need be + */ +#define IO_RINGFD_REG_MAX 16 + struct io_uring_task { /* submission side */ int cached_refs; @@ -476,6 +508,7 @@ struct io_uring_task { struct io_wq_work_list task_list; struct io_wq_work_list prior_task_list; struct callback_head task_work; + struct file **registered_rings; bool task_running; }; @@ -690,6 +723,12 @@ struct io_hardlink { int flags; }; +struct io_msg { + struct file *file; + u64 user_data; + u32 len; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -741,6 +780,8 @@ enum { REQ_F_ARM_LTIMEOUT_BIT, REQ_F_ASYNC_DATA_BIT, REQ_F_SKIP_LINK_CQES_BIT, + REQ_F_SINGLE_POLL_BIT, + REQ_F_DOUBLE_POLL_BIT, /* keep async read/write and isreg together and in order */ REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, @@ -799,6 +840,10 @@ enum { REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), /* don't post CQEs while failing linked requests */ REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), + /* single poll may be active */ + REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT), + /* double poll may active */ + REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), }; struct async_poll { @@ -825,7 +870,7 @@ enum { * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can * access the file pointer through any of the sub-structs, - * or directly as just 'ki_filp' in this struct. + * or directly as just 'file' in this struct. 
*/ struct io_kiocb { union { @@ -855,6 +900,7 @@ struct io_kiocb { struct io_mkdir mkdir; struct io_symlink symlink; struct io_hardlink hardlink; + struct io_msg msg; }; u8 opcode; @@ -877,6 +923,7 @@ struct io_kiocb { /* used by request caches, completion batching and iopoll */ struct io_wq_work_node comp_list; atomic_t refs; + atomic_t poll_refs; struct io_kiocb *link; struct io_task_work io_task_work; /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ @@ -885,12 +932,11 @@ struct io_kiocb { struct async_poll *apoll; /* opcode allocated if it needs to store data for async defer */ void *async_data; - struct io_wq_work work; /* custom credentials, valid IFF REQ_F_CREDS is set */ - const struct cred *creds; /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ struct io_buffer *kbuf; - atomic_t poll_refs; + const struct cred *creds; + struct io_wq_work work; }; struct io_tctx_node { @@ -1105,6 +1151,9 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_MKDIRAT] = {}, [IORING_OP_SYMLINKAT] = {}, [IORING_OP_LINKAT] = {}, + [IORING_OP_MSG_RING] = { + .needs_file = 1, + }, }; /* requests with any of those set should undergo io_disarm_next() */ @@ -1141,6 +1190,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); +static void io_eventfd_signal(struct io_ring_ctx *ctx); static struct kmem_cache *req_cachep; @@ -1267,36 +1317,88 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, } } -static unsigned int __io_put_kbuf(struct io_kiocb *req) +static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) { struct io_buffer *kbuf = req->kbuf; unsigned int cflags; - cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; - cflags |= IORING_CQE_F_BUFFER; + cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT); req->flags &= 
~REQ_F_BUFFER_SELECTED; - kfree(kbuf); + list_add(&kbuf->list, list); req->kbuf = NULL; return cflags; } -static inline unsigned int io_put_kbuf(struct io_kiocb *req) +static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) +{ + if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + return 0; + return __io_put_kbuf(req, &req->ctx->io_buffers_comp); +} + +static inline unsigned int io_put_kbuf(struct io_kiocb *req, + unsigned issue_flags) { + unsigned int cflags; + if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) return 0; - return __io_put_kbuf(req); + + /* + * We can add this buffer back to two lists: + * + * 1) The io_buffers_cache list. This one is protected by the + * ctx->uring_lock. If we already hold this lock, add back to this + * list as we can grab it from issue as well. + * 2) The io_buffers_comp list. This one is protected by the + * ctx->completion_lock. + * + * We migrate buffers from the comp_list to the issue cache list + * when we need one. + */ + if (issue_flags & IO_URING_F_UNLOCKED) { + struct io_ring_ctx *ctx = req->ctx; + + spin_lock(&ctx->completion_lock); + cflags = __io_put_kbuf(req, &ctx->io_buffers_comp); + spin_unlock(&ctx->completion_lock); + } else { + cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache); + } + + return cflags; } -static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl) +static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, + unsigned int bgid) { - bool got = percpu_ref_tryget(ref); + struct list_head *hash_list; + struct io_buffer_list *bl; - /* already at zero, wait for ->release() */ - if (!got) - wait_for_completion(compl); - percpu_ref_resurrect(ref); - if (got) - percpu_ref_put(ref); + hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)]; + list_for_each_entry(bl, hash_list, list) + if (bl->bgid == bgid || bgid == -1U) + return bl; + + return NULL; +} + +static void io_kbuf_recycle(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; 
+ struct io_buffer_list *bl; + struct io_buffer *buf; + + if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + return; + + lockdep_assert_held(&ctx->uring_lock); + + buf = req->kbuf; + bl = io_buffer_get_list(ctx, buf->bgid); + list_add(&buf->list, &bl->buf_list); + req->flags &= ~REQ_F_BUFFER_SELECTED; + req->kbuf = NULL; } static bool io_match_task(struct io_kiocb *head, struct task_struct *task, @@ -1409,7 +1511,7 @@ static __cold void io_fallback_req_func(struct work_struct *work) static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; - int hash_bits; + int i, hash_bits; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -1436,6 +1538,13 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) /* set invalid range, so io_import_fixed() fails meeting it */ ctx->dummy_ubuf->ubuf = -1UL; + ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS, + sizeof(struct list_head), GFP_KERNEL); + if (!ctx->io_buffers) + goto err; + for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) + INIT_LIST_HEAD(&ctx->io_buffers[i]); + if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) goto err; @@ -1444,14 +1553,17 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->sqo_sq_wait); INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); + INIT_LIST_HEAD(&ctx->io_buffers_cache); + INIT_LIST_HEAD(&ctx->apoll_cache); init_completion(&ctx->ref_comp); - xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->cq_wait); spin_lock_init(&ctx->completion_lock); spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); + INIT_LIST_HEAD(&ctx->io_buffers_pages); + INIT_LIST_HEAD(&ctx->io_buffers_comp); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); 
INIT_LIST_HEAD(&ctx->ltimeout_list); @@ -1464,10 +1576,15 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_WQ_LIST(&ctx->locked_free_list); INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); INIT_WQ_LIST(&ctx->submit_state.compl_reqs); +#ifdef CONFIG_NET_RX_BUSY_POLL + INIT_LIST_HEAD(&ctx->napi_list); + spin_lock_init(&ctx->napi_lock); +#endif return ctx; err: kfree(ctx->dummy_ubuf); kfree(ctx->cancel_hash); + kfree(ctx->io_buffers); kfree(ctx); return NULL; } @@ -1610,8 +1727,8 @@ static void io_queue_async_work(struct io_kiocb *req, bool *dont_use) if (WARN_ON_ONCE(!same_thread_group(req->task, current))) req->work.flags |= IO_WQ_WORK_CANCEL; - trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, - &req->work, req->flags); + trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags, + &req->work, io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); if (link) io_queue_linked_timeout(link); @@ -1681,22 +1798,27 @@ static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->timeout_lock); } -static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx) -{ - if (ctx->off_timeout_used) - io_flush_timeouts(ctx); - if (ctx->drain_active) - io_queue_deferred(ctx); -} - static inline void io_commit_cqring(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || ctx->drain_active)) - __io_commit_cqring_flush(ctx); /* order cqe stores with ring update */ smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); } +static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) +{ + if (ctx->off_timeout_used || ctx->drain_active) { + spin_lock(&ctx->completion_lock); + if (ctx->off_timeout_used) + io_flush_timeouts(ctx); + if (ctx->drain_active) + io_queue_deferred(ctx); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + } + if (ctx->has_evfd) + io_eventfd_signal(ctx); +} + static inline bool 
io_sqring_full(struct io_ring_ctx *ctx) { struct io_rings *r = ctx->rings; @@ -1726,23 +1848,34 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) return &rings->cqes[tail & mask]; } -static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) +static void io_eventfd_signal(struct io_ring_ctx *ctx) { - if (likely(!ctx->cq_ev_fd)) - return false; + struct io_ev_fd *ev_fd; + + rcu_read_lock(); + /* + * rcu_dereference ctx->io_ev_fd once and use it for both for checking + * and eventfd_signal + */ + ev_fd = rcu_dereference(ctx->io_ev_fd); + + /* + * Check again if ev_fd exists incase an io_eventfd_unregister call + * completed between the NULL check of ctx->io_ev_fd at the start of + * the function and rcu_read_lock. + */ + if (unlikely(!ev_fd)) + goto out; if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) - return false; - return !ctx->eventfd_async || io_wq_current_is_worker(); + goto out; + + if (!ev_fd->eventfd_async || io_wq_current_is_worker()) + eventfd_signal(ev_fd->cq_ev_fd, 1); +out: + rcu_read_unlock(); } -/* - * This should only get called when at least one event has been posted. - * Some applications rely on the eventfd notification count only changing - * IFF a new CQE has been added to the CQ ring. There's no depedency on - * 1:1 relationship between how many times this function is called (and - * hence the eventfd count) and number of CQEs posted to the CQ ring. - */ -static void io_cqring_ev_posted(struct io_ring_ctx *ctx) +static inline void io_cqring_wake(struct io_ring_ctx *ctx) { /* * wake_up_all() may seem excessive, but io_wake_function() and @@ -1751,21 +1884,32 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) */ if (wq_has_sleeper(&ctx->cq_wait)) wake_up_all(&ctx->cq_wait); - if (io_should_trigger_evfd(ctx)) - eventfd_signal(ctx->cq_ev_fd, 1); +} + +/* + * This should only get called when at least one event has been posted. 
+ * Some applications rely on the eventfd notification count only changing + * IFF a new CQE has been added to the CQ ring. There's no depedency on + * 1:1 relationship between how many times this function is called (and + * hence the eventfd count) and number of CQEs posted to the CQ ring. + */ +static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx) +{ + if (unlikely(ctx->off_timeout_used || ctx->drain_active || + ctx->has_evfd)) + __io_commit_cqring_flush(ctx); + + io_cqring_wake(ctx); } static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) { - /* see waitqueue_active() comment */ - smp_mb(); + if (unlikely(ctx->off_timeout_used || ctx->drain_active || + ctx->has_evfd)) + __io_commit_cqring_flush(ctx); - if (ctx->flags & IORING_SETUP_SQPOLL) { - if (waitqueue_active(&ctx->cq_wait)) - wake_up_all(&ctx->cq_wait); - } - if (io_should_trigger_evfd(ctx)) - eventfd_signal(ctx->cq_ev_fd, 1); + if (ctx->flags & IORING_SETUP_SQPOLL) + io_cqring_wake(ctx); } /* Returns true if there are no backlogged entries after the flush */ @@ -1905,8 +2049,6 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, { struct io_uring_cqe *cqe; - trace_io_uring_complete(ctx, user_data, res, cflags); - /* * If we can't get a cq entry, userspace overflowed the * submission (by quite a lot). 
Increment the overflow count in @@ -1922,16 +2064,23 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, return io_cqring_event_overflow(ctx, user_data, res, cflags); } +static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) +{ + trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags); + return __io_fill_cqe(req->ctx, req->user_data, res, cflags); +} + static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) { if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe(req->ctx, req->user_data, res, cflags); + __io_fill_cqe_req(req, res, cflags); } static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { ctx->cq_extra++; + trace_io_uring_complete(ctx, NULL, user_data, res, cflags); return __io_fill_cqe(ctx, user_data, res, cflags); } @@ -1941,7 +2090,7 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res, struct io_ring_ctx *ctx = req->ctx; if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe(ctx, req->user_data, res, cflags); + __io_fill_cqe_req(req, res, cflags); /* * If we're the last reference to this request, add to our locked * free_list cache. 
@@ -2000,7 +2149,7 @@ static inline void io_req_complete(struct io_kiocb *req, s32 res) static void io_req_complete_failed(struct io_kiocb *req, s32 res) { req_set_fail(req); - io_req_complete_post(req, res, 0); + io_req_complete_post(req, res, io_put_kbuf(req, 0)); } static void io_req_complete_fail_submit(struct io_kiocb *req) @@ -2183,7 +2332,9 @@ static void io_fail_links(struct io_kiocb *req) nxt = link->link; link->link = NULL; - trace_io_uring_fail_link(req, link); + trace_io_uring_fail_link(req->ctx, req, req->user_data, + req->opcode, link); + if (!ignore_cqes) { link->flags &= ~REQ_F_CQE_SKIP; io_fill_cqe_req(link, res, 0); @@ -2302,7 +2453,8 @@ static void handle_prev_tw_list(struct io_wq_work_node *node, if (likely(*uring_locked)) req->io_task_work.func(req, uring_locked); else - __io_req_complete_post(req, req->result, io_put_kbuf(req)); + __io_req_complete_post(req, req->result, + io_put_kbuf_comp(req)); node = next; } while (node); @@ -2530,8 +2682,16 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) comp_list); if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe(ctx, req->user_data, req->result, - req->cflags); + __io_fill_cqe_req(req, req->result, req->cflags); + if ((req->flags & REQ_F_POLLED) && req->apoll) { + struct async_poll *apoll = req->apoll; + + if (apoll->double_poll) + kfree(apoll->double_poll); + list_add(&apoll->poll.wait.entry, + &ctx->apoll_cache); + req->flags &= ~REQ_F_POLLED; + } } io_commit_cqring(ctx); @@ -2653,7 +2813,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) if (unlikely(req->flags & REQ_F_CQE_SKIP)) continue; - __io_fill_cqe(ctx, req->user_data, req->result, io_put_kbuf(req)); + __io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0)); nr_events++; } @@ -2829,14 +2989,14 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) static inline void io_req_task_complete(struct io_kiocb *req, bool *locked) { - unsigned int cflags = io_put_kbuf(req); int res = 
req->result; if (*locked) { - io_req_complete_state(req, res, cflags); + io_req_complete_state(req, res, io_put_kbuf(req, 0)); io_req_add_compl_list(req); } else { - io_req_complete_post(req, res, cflags); + io_req_complete_post(req, res, + io_put_kbuf(req, IO_URING_F_UNLOCKED)); } } @@ -2845,7 +3005,8 @@ static void __io_complete_rw(struct io_kiocb *req, long res, { if (__io_complete_rw_common(req, res)) return; - __io_req_complete(req, issue_flags, req->result, io_put_kbuf(req)); + __io_req_complete(req, issue_flags, req->result, + io_put_kbuf(req, issue_flags)); } static void io_complete_rw(struct kiocb *kiocb, long res) @@ -3000,14 +3161,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; kiocb->ki_pos = READ_ONCE(sqe->off); - if (kiocb->ki_pos == -1) { - if (!(file->f_mode & FMODE_STREAM)) { - req->flags |= REQ_F_CUR_POS; - kiocb->ki_pos = file->f_pos; - } else { - kiocb->ki_pos = 0; - } - } kiocb->ki_flags = iocb_flags(file); ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); |
