diff options
| -rw-r--r-- | fs/io-wq.c | 58 | ||||
| -rw-r--r-- | fs/io-wq.h | 59 | ||||
| -rw-r--r-- | fs/io_uring.c | 1714 | ||||
| -rw-r--r-- | include/trace/events/io_uring.h | 61 | ||||
| -rw-r--r-- | include/uapi/linux/io_uring.h | 1 |
5 files changed, 1045 insertions, 848 deletions
diff --git a/fs/io-wq.c b/fs/io-wq.c index 422a7ed6a9bd..38b33ad9e8cf 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -140,6 +140,7 @@ static void io_wqe_dec_running(struct io_worker *worker); static bool io_acct_cancel_pending_work(struct io_wqe *wqe, struct io_wqe_acct *acct, struct io_cb_cancel_data *match); +static void create_worker_cb(struct callback_head *cb); static bool io_worker_get(struct io_worker *worker) { @@ -174,12 +175,46 @@ static void io_worker_ref_put(struct io_wq *wq) complete(&wq->worker_done); } +static void io_worker_cancel_cb(struct io_worker *worker) +{ + struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wqe *wqe = worker->wqe; + struct io_wq *wq = wqe->wq; + + atomic_dec(&acct->nr_running); + raw_spin_lock(&worker->wqe->lock); + acct->nr_workers--; + raw_spin_unlock(&worker->wqe->lock); + io_worker_ref_put(wq); + clear_bit_unlock(0, &worker->create_state); + io_worker_release(worker); +} + +static bool io_task_worker_match(struct callback_head *cb, void *data) +{ + struct io_worker *worker; + + if (cb->func != create_worker_cb) + return false; + worker = container_of(cb, struct io_worker, create_work); + return worker == data; +} + static void io_worker_exit(struct io_worker *worker) { struct io_wqe *wqe = worker->wqe; + struct io_wq *wq = wqe->wq; - if (refcount_dec_and_test(&worker->ref)) - complete(&worker->ref_done); + while (1) { + struct callback_head *cb = task_work_cancel_match(wq->task, + io_task_worker_match, worker); + + if (!cb) + break; + io_worker_cancel_cb(worker); + } + + io_worker_release(worker); wait_for_completion(&worker->ref_done); raw_spin_lock(&wqe->lock); @@ -323,8 +358,10 @@ static bool io_queue_worker_create(struct io_worker *worker, init_task_work(&worker->create_work, func); worker->create_index = acct->index; - if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) + if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) { + clear_bit_unlock(0, &worker->create_state); return true; + } clear_bit_unlock(0, &worker->create_state); fail_release: io_worker_release(worker); @@ -716,11 +753,8 @@ static void io_workqueue_create(struct work_struct *work) struct io_worker *worker = container_of(work, struct io_worker, work); struct io_wqe_acct *acct = io_wqe_get_acct(worker); - if (!io_queue_worker_create(worker, acct, create_worker_cont)) { - clear_bit_unlock(0, &worker->create_state); - io_worker_release(worker); + if (!io_queue_worker_create(worker, acct, create_worker_cont)) kfree(worker); - } } static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) @@ -1150,17 +1184,9 @@ static void io_wq_exit_workers(struct io_wq *wq) while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) { struct io_worker *worker; - struct io_wqe_acct *acct; worker = container_of(cb, struct io_worker, create_work); - acct = io_wqe_get_acct(worker); - atomic_dec(&acct->nr_running); - raw_spin_lock(&worker->wqe->lock); - acct->nr_workers--; - raw_spin_unlock(&worker->wqe->lock); - io_worker_ref_put(wq); - clear_bit_unlock(0, &worker->create_state); - io_worker_release(worker); + io_worker_cancel_cb(worker); } rcu_read_lock(); diff --git a/fs/io-wq.h b/fs/io-wq.h index bf5c4c533760..41bf37674a49 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -29,6 +29,17 @@ struct io_wq_work_list { struct io_wq_work_node *last; }; +#define wq_list_for_each(pos, prv, head) \ + for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) + +#define wq_list_for_each_resume(pos, prv) \ + for (; pos; prv = pos, pos = (pos)->next) + +#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL) +#define INIT_WQ_LIST(list) do { \ + (list)->first = NULL; \ +} while (0) + static inline void wq_list_add_after(struct io_wq_work_node *node, struct io_wq_work_node *pos, struct io_wq_work_list *list) @@ -54,6 +65,15 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node, } } +static inline void wq_list_add_head(struct io_wq_work_node *node, + struct io_wq_work_list *list) +{ + node->next = list->first; + if (!node->next) + list->last = node; + WRITE_ONCE(list->first, node); +} + static inline void wq_list_cut(struct io_wq_work_list *list, struct io_wq_work_node *last, struct io_wq_work_node *prev) @@ -69,6 +89,31 @@ static inline void wq_list_cut(struct io_wq_work_list *list, last->next = NULL; } +static inline void __wq_list_splice(struct io_wq_work_list *list, + struct io_wq_work_node *to) +{ + list->last->next = to->next; + to->next = list->first; + INIT_WQ_LIST(list); +} + +static inline bool wq_list_splice(struct io_wq_work_list *list, + struct io_wq_work_node *to) +{ + if (!wq_list_empty(list)) { + __wq_list_splice(list, to); + return true; + } + return false; +} + +static inline void wq_stack_add_head(struct io_wq_work_node *node, + struct io_wq_work_node *stack) +{ + node->next = stack->next; + stack->next = node; +} + static inline void wq_list_del(struct io_wq_work_list *list, struct io_wq_work_node *node, struct io_wq_work_node *prev) @@ -76,14 +121,14 @@ static inline void wq_list_del(struct io_wq_work_list *list, wq_list_cut(list, node, prev); } -#define wq_list_for_each(pos, prv, head) \ - for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) +static inline +struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) +{ + struct io_wq_work_node *node = stack->next; -#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL) -#define INIT_WQ_LIST(list) do { \ - (list)->first = NULL; \ - (list)->last = NULL; \ -} while (0) + stack->next = node->next; + return node; +} struct io_wq_work { struct io_wq_work_node list; diff --git a/fs/io_uring.c b/fs/io_uring.c index 057d07cee9f8..ca10dbb01201 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -103,11 +103,14 @@ #define IORING_MAX_REG_BUFFERS (1U << 14) -#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ - IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ - IOSQE_BUFFER_SELECT) +#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ + IOSQE_IO_HARDLINK | IOSQE_ASYNC) + +#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS|IOSQE_BUFFER_SELECT|IOSQE_IO_DRAIN) + #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ - REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS) + REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ + REQ_F_ASYNC_DATA) #define IO_TCTX_REFS_CACHE_NR (1U << 10) @@ -195,8 +198,10 @@ struct io_rings { }; enum io_uring_cmd_flags { - IO_URING_F_NONBLOCK = 1, - IO_URING_F_COMPLETE_DEFER = 2, + IO_URING_F_COMPLETE_DEFER = 1, + IO_URING_F_UNLOCKED = 2, + /* int's last bit, sign checks are usually faster than a bit test */ + IO_URING_F_NONBLOCK = INT_MIN, }; struct io_mapped_ubuf { @@ -305,26 +310,16 @@ struct io_submit_link { }; struct io_submit_state { - struct blk_plug plug; + /* inline/task_work completion list, under ->uring_lock */ + struct io_wq_work_node free_list; + /* batch completion logic */ + struct io_wq_work_list compl_reqs; struct io_submit_link link; - /* - * io_kiocb alloc cache - */ - void *reqs[IO_REQ_CACHE_SIZE]; - unsigned int free_reqs; - bool plug_started; - - /* - * Batch completion logic - */ - struct io_kiocb *compl_reqs[IO_COMPL_BATCH]; - unsigned int compl_nr; - /* inline/task_work completion list, under ->uring_lock */ - struct list_head free_list; - - unsigned int ios_left; + bool need_plug; + unsigned short submit_nr; + struct blk_plug plug; }; struct io_ring_ctx { @@ -368,6 +363,7 @@ struct io_ring_ctx { * uring_lock, and updated through io_uring_register(2) */ struct io_rsrc_node *rsrc_node; + int rsrc_cached_refs; struct io_file_table file_table; unsigned nr_user_files; unsigned nr_user_bufs; @@ -384,7 +380,7 @@ struct io_ring_ctx { } ____cacheline_aligned_in_smp; /* IRQ completion list, under ->completion_lock */ - struct list_head locked_free_list; + struct io_wq_work_list locked_free_list; unsigned int locked_free_nr; const struct cred *sq_creds; /* cred used for __io_sq_thread() */ @@ -399,7 +395,6 @@ struct io_ring_ctx { unsigned cached_cq_tail; unsigned cq_entries; struct eventfd_ctx *cq_ev_fd; - struct wait_queue_head poll_wait; struct wait_queue_head cq_wait; unsigned cq_extra; atomic_t cq_timeouts; @@ -417,7 +412,7 @@ struct io_ring_ctx { * For SQPOLL, only the single threaded io_sq_thread() will * manipulate the list, hence no extra locking is needed there. */ - struct list_head iopoll_list; + struct io_wq_work_list iopoll_list; struct hlist_head *cancel_hash; unsigned cancel_hash_bits; bool poll_multi_queue; @@ -580,7 +575,6 @@ struct io_sr_msg { int msg_flags; int bgid; size_t len; - struct io_buffer *kbuf; }; struct io_open { @@ -692,11 +686,6 @@ struct io_hardlink { int flags; }; -struct io_completion { - struct file *file; - u32 cflags; -}; - struct io_async_connect { struct sockaddr_storage address; }; @@ -710,11 +699,15 @@ struct io_async_msghdr { struct sockaddr_storage addr; }; -struct io_async_rw { - struct iovec fast_iov[UIO_FASTIOV]; - const struct iovec *free_iovec; +struct io_rw_state { struct iov_iter iter; struct iov_iter_state iter_state; + struct iovec fast_iov[UIO_FASTIOV]; +}; + +struct io_async_rw { + struct io_rw_state s; + const struct iovec *free_iovec; size_t bytes_done; struct wait_page_queue wpq; }; @@ -741,9 +734,9 @@ enum { REQ_F_CREDS_BIT, REQ_F_REFCOUNT_BIT, REQ_F_ARM_LTIMEOUT_BIT, + REQ_F_ASYNC_DATA_BIT, /* keep async read/write and isreg together and in order */ - REQ_F_NOWAIT_READ_BIT, - REQ_F_NOWAIT_WRITE_BIT, + REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, /* not a real bit, just to check we're not overflowing the space */ @@ -784,10 +777,8 @@ enum { REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), /* caller should reissue async */ REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), - /* supports async reads */ - REQ_F_NOWAIT_READ = BIT(REQ_F_NOWAIT_READ_BIT), - /* supports async writes */ - REQ_F_NOWAIT_WRITE = BIT(REQ_F_NOWAIT_WRITE_BIT), + /* supports async reads/writes */ + REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT), /* regular file */ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), /* has creds assigned */ @@ -796,6 +787,8 @@ enum { REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), /* there is a linked timeout that has to be armed */ REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), + /* ->async_data allocated */ + REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), }; struct async_poll { @@ -852,39 +845,41 @@ struct io_kiocb { struct io_mkdir mkdir; struct io_symlink symlink; struct io_hardlink hardlink; - /* use only after cleaning per-op data, see io_clean_op() */ - struct io_completion compl; }; - /* opcode allocated if it needs to store data for async defer */ - void *async_data; u8 opcode; /* polled IO has completed */ u8 iopoll_completed; - u16 buf_index; + unsigned int flags; + + u64 user_data; u32 result; + u32 cflags; struct io_ring_ctx *ctx; - unsigned int flags; - atomic_t refs; struct task_struct *task; - u64 user_data; - struct io_kiocb *link; struct percpu_ref *fixed_rsrc_refs; + /* store used ubuf, so we can prevent reloading */ + struct io_mapped_ubuf *imu; - /* used with ctx->iopoll_list with reads/writes */ - struct list_head inflight_entry; + /* used by request caches, completion batching and iopoll */ + struct io_wq_work_node comp_list; + atomic_t refs; + struct io_kiocb *link; struct io_task_work io_task_work; /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ struct hlist_node hash_node; + /* internal polling, see IORING_FEAT_FAST_POLL */ struct async_poll *apoll; + /* opcode allocated if it needs to store data for async defer */ + void *async_data; struct io_wq_work work; + /* custom credentials, valid IFF REQ_F_CREDS is set */ const struct cred *creds; - - /* store used ubuf, so we can prevent reloading */ - struct io_mapped_ubuf *imu; + /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ + struct io_buffer *kbuf; }; struct io_tctx_node { @@ -902,12 +897,12 @@ struct io_defer_entry { struct io_op_def { /* needs req->file assigned */ unsigned needs_file : 1; + /* should block plug */ + unsigned plug : 1; /* hash wq insertion if file is a regular file */ unsigned hash_reg_file : 1; /* unbound wq insertion if file is a non-regular file */ unsigned unbound_nonreg_file : 1; - /* opcode is not supported by this kernel */ - unsigned not_supported : 1; /* set if opcode supports polled "wait" */ unsigned pollin : 1; unsigned pollout : 1; @@ -915,8 +910,8 @@ struct io_op_def { unsigned buffer_select : 1; /* do prep async if is going to be punted */ unsigned needs_async_setup : 1; - /* should block plug */ - unsigned plug : 1; + /* opcode is not supported by this kernel */ + unsigned not_supported : 1; /* size of async data needed, if any */ unsigned short async_size; }; @@ -1080,7 +1075,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, - long res, unsigned int cflags); + s32 res, u32 cflags); static void io_put_req(struct io_kiocb *req); static void io_put_req_deferred(struct io_kiocb *req); static void io_dismantle_req(struct io_kiocb *req); @@ -1095,7 +1090,7 @@ static void __io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); static void io_req_task_queue(struct io_kiocb *req); -static void io_submit_flush_completions(struct io_ring_ctx *ctx); +static void __io_submit_flush_completions(struct io_ring_ctx *ctx); static int io_req_prep_async(struct io_kiocb *req); static int io_install_fixed_file(struct io_kiocb *req, struct file *file, @@ -1167,6 +1162,12 @@ static inline void req_ref_get(struct io_kiocb *req) atomic_inc(&req->refs); } +static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) +{ + if (!wq_list_empty(&ctx->submit_state.compl_reqs)) + __io_submit_flush_completions(ctx); +} + static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) { if (!(req->flags & REQ_F_REFCOUNT)) { @@ -1180,13 +1181,52 @@ static inline void io_req_set_refcount(struct io_kiocb *req) __io_req_set_refcount(req, 1); } -static inline void io_req_set_rsrc_node(struct io_kiocb *req) +#define IO_RSRC_REF_BATCH 100 + +static inline void io_req_put_rsrc_locked(struct io_kiocb *req, + struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) { - struct io_ring_ctx *ctx = req->ctx; + struct percpu_ref *ref = req->fixed_rsrc_refs; + if (ref) { + if (ref == &ctx->rsrc_node->refs) + ctx->rsrc_cached_refs++; + else + percpu_ref_put(ref); + } +} + +static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx) +{ + if (req->fixed_rsrc_refs) + percpu_ref_put(req->fixed_rsrc_refs); +} + +static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + if (ctx->rsrc_cached_refs) { + percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs); + ctx->rsrc_cached_refs = 0; + } +} + +static void io_rsrc_refs_refill(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH; + percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH); +} + +static inline void io_req_set_rsrc_node(struct io_kiocb *req, + struct io_ring_ctx *ctx) +{ if (!req->fixed_rsrc_refs) { req->fixed_rsrc_refs = &ctx->rsrc_node->refs; - percpu_ref_get(req->fixed_rsrc_refs); + ctx->rsrc_cached_refs--; + if (unlikely(ctx->rsrc_cached_refs < 0)) + io_rsrc_refs_refill(ctx); } } @@ -1219,6 +1259,11 @@ static bool io_match_task(struct io_kiocb *head, struct task_struct *task, return false; } +static inline bool req_has_async_data(struct io_kiocb *req) +{ + return req->flags & REQ_F_ASYNC_DATA; +} + static inline void req_set_fail(struct io_kiocb *req) { req->flags |= REQ_F_FAIL; @@ -1230,7 +1275,7 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res) req->result = res; } -static void io_ring_ctx_ref_free(struct percpu_ref *ref) +static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) { struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); @@ -1242,7 +1287,7 @@ static inline bool io_is_timeout_noseq(struct io_kiocb *req) return !req->timeout.off; } -static void io_fallback_req_func(struct work_struct *work) +static __cold void io_fallback_req_func(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, fallback_work.work); @@ -1255,15 +1300,13 @@ static void io_fallback_req_func(struct work_struct *work) req->io_task_work.func(req, &locked); if (locked) { - if (ctx->submit_state.compl_nr) - io_submit_flush_completions(ctx); + io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); } percpu_ref_put(&ctx->refs); - } -static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) +static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; int hash_bits; @@ -1300,7 +1343,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ctx->flags = p->flags; init_waitqueue_head(&ctx->sqo_sq_wait); INIT_LIST_HEAD(&ctx->sqd_list); - init_waitqueue_head(&ctx->poll_wait); INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1); @@ -1309,7 +1351,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->cq_wait); spin_lock_init(&ctx->completion_lock); spin_lock_init(&ctx->timeout_lock); - INIT_LIST_HEAD(&ctx->iopoll_list); + INIT_WQ_LIST(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); @@ -1318,9 +1360,10 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); init_llist_head(&ctx->rsrc_put_llist); INIT_LIST_HEAD(&ctx->tctx_list); - INIT_LIST_HEAD(&ctx->submit_state.free_list); - INIT_LIST_HEAD(&ctx->locked_free_list); + ctx->submit_state.free_list.next = NULL; + INIT_WQ_LIST(&ctx->locked_free_list); INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); + INIT_WQ_LIST(&ctx->submit_state.compl_reqs); return ctx; err: kfree(ctx->dummy_ubuf); @@ -1348,21 +1391,16 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) return false; } -#define FFS_ASYNC_READ 0x1UL -#define FFS_ASYNC_WRITE 0x2UL -#ifdef CONFIG_64BIT -#define FFS_ISREG 0x4UL -#else -#define FFS_ISREG 0x0UL -#endif -#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG) +#define FFS_NOWAIT 0x1UL +#define FFS_ISREG 0x2UL +#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) static inline bool io_req_ffs_set(struct io_kiocb *req) { - return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE); + return req->flags & REQ_F_FIXED_FILE; } -static void io_req_track_inflight(struct io_kiocb *req) +static inline void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { req->flags |= REQ_F_INFLIGHT; @@ -1440,15 +1478,19 @@ static void io_prep_async_link(struct io_kiocb *req) } } -static void io_queue_async_work(struct io_kiocb *req, bool *locked) +static inline void io_req_add_compl_list(struct io_kiocb *req) +{ + struct io_submit_state *state = &req->ctx->submit_state; + + wq_list_add_tail(&req->comp_list, &state->compl_reqs); +} + +static void io_queue_async_work(struct io_kiocb *req, bool *dont_use) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; - /* must not take the lock, NULL it as a precaution */ - locked = NULL; - BUG_ON(!tctx); BUG_ON(!tctx->io_wq); @@ -1489,7 +1531,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status) } } -static void io_queue_deferred(struct io_ring_ctx *ctx) +static __cold void io_queue_deferred(struct io_ring_ctx *ctx) { while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, @@ -1503,7 +1545,7 @@ static void io_queue_deferred(struct io_ring_ctx *ctx) } } -static void io_flush_timeouts(struct io_ring_ctx *ctx) +static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) __must_hold(&ctx->completion_lock) { u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); @@ -1536,7 +1578,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->timeout_lock); } -static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) +static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (ctx->off_timeout_used) io_flush_timeouts(ctx); @@ -1606,12 +1648,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) */ if (wq_has_sleeper(&ctx->cq_wait)) wake_up_all(&ctx->cq_wait); - if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait)) - wake_up(&ctx->sq_data->wait); if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); - if (waitqueue_active(&ctx->poll_wait)) - wake_up_interruptible(&ctx->poll_wait); } static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) @@ -1625,8 +1663,6 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) } if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); - if (waitqueue_active(&ctx->poll_wait)) - wake_up_interruptible(&ctx->poll_wait); } /* Returns true if there are no backlogged entries after the flush */ @@ -1722,7 +1758,7 @@ static inline void io_get_task_refs(int nr) } static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, - long res, unsigned int cflags) + s32 res, u32 cflags) { struct io_overflow_cqe *ocqe; @@ -1750,7 +1786,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, } static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, - long res, unsigned int cflags) + s32 res, u32 cflags) { struct io_uring_cqe *cqe; @@ -1773,13 +1809,13 @@ static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data /* not as hot to bloat with inlining */ static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, - long res, unsigned int cflags) + s32 res, u32 cflags) { return __io_cqring_fill_event(ctx, user_data, res, cflags); } -static void io_req_complete_post(struct io_kiocb *req, long res, - unsigned int cflags) +static void io_req_complete_post(struct io_kiocb *req, s32 res, + u32 cflags) { struct io_ring_ctx *ctx = req->ctx; @@ -1798,40 +1834,27 @@ static void io_req_complete_post(struct io_kiocb *req, long res, req->link = NULL; } } + io_req_put_rsrc(req, ctx); io_dismantle_req(req); io_put_task(req->task, 1); - list_add(&req->inflight_entry, &ctx->locked_free_list); + wq_list_add_head(&req->comp_list, &ctx->locked_free_list); ctx->locked_free_nr++; - } else { - if (!percpu_ref_tryget(&ctx->refs)) - req = NULL; } io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - - if (req) { - io_cqring_ev_posted(ctx); - percpu_ref_put(&ctx->refs); - } -} - -static inline bool io_req_needs_clean(struct io_kiocb *req) -{ - return req->flags & IO_REQ_CLEAN_FLAGS; + io_cqring_ev_posted(ctx); } -static void io_req_complete_state(struct io_kiocb *req, long res, - unsigned int cflags) +static inline void io_req_complete_state(struct io_kiocb *req, s32 res, + u32 cflags) { - if (io_req_needs_clean(req)) - io_clean_op(req); req->result = res; - req->compl.cflags = cflags; + req->cflags = cflags; req->flags |= REQ_F_COMPLETE_INLINE; } static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, - long res, unsigned cflags) + s32 res, u32 cflags) { if (issue_flags & IO_URING_F_COMPLETE_DEFER) io_req_complete_state(req, res, cflags); @@ -1839,12 +1862,12 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, io_req_complete_post(req, res, cflags); } -static inline void io_req_complete(struct io_kiocb *req, long res) +static inline void io_req_complete(struct io_kiocb *req, s32 res) { __io_req_complete(req, 0, res, 0); } -static void io_req_complete_failed(struct io_kiocb *req, long res) +static void io_req_complete_failed(struct io_kiocb *req, s32 res) { req_set_fail(req); io_req_complete_post(req, res, 0); @@ -1878,7 +1901,7 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, struct io_submit_state *state) { spin_lock(&ctx->completion_lock); - list_splice_init(&ctx->locked_free_list, &state->free_list); + wq_list_splice(&ctx->locked_free_list, &state->free_list); ctx->locked_free_nr = 0; spin_unlock(&ctx->completion_lock); } @@ -1887,7 +1910,6 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; - int nr; /* * If we have more than a batch's worth of requests in our IRQ side @@ -1896,20 +1918,7 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) */ if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) io_flush_cached_locked_reqs(ctx, state); - - nr = state->free_reqs; - while (!list_empty(&state->free_list)) { - struct io_kiocb *req = list_first_entry(&state->free_list, - struct io_kiocb, inflight_entry); - - list_del(&req->inflight_entry); - state->reqs[nr++] = req; - if (nr == ARRAY_SIZE(state->reqs)) - break; - } - - state->free_reqs = nr; - return nr != 0; + return !!state->free_list.next; } /* @@ -1918,38 +1927,54 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) * Because of that, io_alloc_req() should be called only under ->uring_lock * and with extra caution to not get a request that is still worked on. */ -static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) +static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { struct io_submit_state *state = &ctx->submit_state; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; + void *reqs[IO_REQ_ALLOC_BATCH]; + struct io_kiocb *req; int ret, i; - BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH); - - if (likely(state->free_reqs || io_flush_cached_reqs(ctx))) - goto got_req; + if (likely(state->free_list.next || io_flush_cached_reqs(ctx))) + return true; - ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH, - state->reqs); + ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); /* * Bulk alloc is all-or-nothing. If we fail to get a batch, * retry single alloc to be on the safe side. */ if (unlikely(ret <= 0)) { - state->reqs[0] = kmem_cache_alloc(req_cachep, gfp); - if (!state->reqs[0]) - return NULL; + reqs[0] = kmem_cache_alloc(req_cachep, gfp); + if (!reqs[0]) + return false; ret = 1; } - for (i = 0; i < ret; i++) - io_preinit_req(state->reqs[i], ctx); - state->free_reqs = ret; -got_req: - state->free_reqs--; - return state->reqs[state->free_reqs]; + percpu_ref_get_many(&ctx->refs, ret); + for (i = 0; i < ret; i++) { + req = reqs[i]; + + io_preinit_req(req, ctx); + wq_stack_add_head(&req->comp_list, &state->free_list); + } + return true; +} + +static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) +{ + if (unlikely(!ctx->submit_state.free_list.next)) + return __io_alloc_req_refill(ctx); + return true; +} + +static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) +{ + struct io_wq_work_node *node; + + node = wq_stack_extract(&ctx->submit_state.free_list); + return container_of(node, struct io_kiocb, comp_list); } static inline void io_put_file(struct file *file) @@ -1958,35 +1983,28 @@ static inline void io_put_file(struct file *file) fput(file); } -static void io_dismantle_req(struct io_kiocb *req) +static inline void io_dismantle_req(struct io_kiocb *req) { unsigned int flags = req->flags; - if (io_req_needs_clean(req)) + if (unlikely(flags & IO_REQ_CLEAN_FLAGS)) io_clean_op(req); if (!(flags & REQ_F_FIXED_FILE)) io_put_file(req->file); - if (req->fixed_rsrc_refs) - percpu_ref_put(req->fixed_rsrc_refs); - if (req->async_data) { - kfree(req->async_data); - req->async_data = NULL; - } } -static void __io_free_req(struct io_kiocb *req) +static __cold void __io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + io_req_put_rsrc(req, ctx); io_dismantle_req(req); io_put_task(req->task, 1); spin_lock(&ctx->completion_lock); - list_add(&req->inflight_entry, &ctx->locked_free_list); + wq_list_add_head(&req->comp_list, &ctx->locked_free_list); ctx->locked_free_nr++; spin_unlock(&ctx->completion_lock); - - percpu_ref_put(&ctx->refs); } static inline void io_remove_next_linked(struct io_kiocb *req) @@ -2072,47 +2090,45 @@ static bool io_disarm_next(struct io_kiocb *req) return posted; } -static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) +static void __io_req_find_next_prep(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + bool posted; + + spin_lock(&ctx->completion_lock); + posted = io_disarm_next(req); + if (posted) + io_commit_cqring(req->ctx); + spin_unlock(&ctx->completion_lock); + if (posted) + io_cqring_ev_posted(ctx); +} + +static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) { struct io_kiocb *nxt; + if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK)))) + return NULL; /* * If LINK is set, we have dependent requests in this chain. If we * didn't fail this request, queue the first one up, moving any other * dependencies to the next request. In case of failure, fail the rest * of the chain. */ - if (req->flags & IO_DISARM_MASK) { - struct io_ring_ctx *ctx = req->ctx; - bool posted; - - spin_lock(&ctx->completion_lock); - posted = io_disarm_next(req); - if (posted) - io_commit_cqring(req->ctx); - spin_unlock(&ctx->completion_lock); - if (posted) - io_cqring_ev_posted(ctx); - } + if (unlikely(req->flags & IO_DISARM_MASK)) + __io_req_find_next_prep(req); nxt = req->link; req->link = NULL; return nxt; } -static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) -{ - if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK)))) - return NULL; - return __io_req_find_next(req); -} - static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) { if (!ctx) return; if (*locked) { - if (ctx->submit_state.compl_nr) - io_submit_flush_completions(ctx); + io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); *locked = false; } @@ -2129,7 +2145,7 @@ static void tctx_task_work(struct callback_head *cb) while (1) { struct io_wq_work_node *node; - if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr) + if (!tctx->task_list.first && locked) io_submit_flush_completions(ctx); spin_lock_irq(&tctx->task_lock); @@ -2192,8 +2208,9 @@ static void io_req_task_work_add(struct io_kiocb *req) * will do the job. */ notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL; - if (!task_work_add(tsk, &tctx->task_work, notify)) { - wake_up_process(tsk); + if (likely(!task_work_add(tsk, &tctx->task_work, notify))) { + if (notify == TWA_NONE) + wake_up_process(tsk); return; } @@ -2271,77 +2288,62 @@ static void io_free_req_work(struct io_kiocb *req, bool *locked) io_free_req(req); } -struct req_batch { - struct task_struct *task; - int task_refs; - int ctx_refs; -}; - -static inline void io_init_req_batch(struct req_batch *rb) +static void io_free_batch_list(struct io_ring_ctx *ctx, + struct io_wq_work_node *node) + __must_hold(&ctx->uring_lock) { - rb->task_refs = 0; - rb->ctx_refs = 0; - rb->task = NULL; -} + struct task_struct |
