Diffstat (limited to 'io_uring')
35 files changed, 1898 insertions, 1698 deletions
diff --git a/io_uring/Makefile b/io_uring/Makefile
index 2e1d4e03799c..fc1b23c524e8 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -2,13 +2,14 @@
 #
 # Makefile for io_uring
 
-obj-$(CONFIG_IO_URING)		+= io_uring.o xattr.o nop.o fs.o splice.o \
-					sync.o advise.o filetable.o \
-					openclose.o uring_cmd.o epoll.o \
-					statx.o net.o msg_ring.o timeout.o \
-					sqpoll.o fdinfo.o tctx.o poll.o \
-					cancel.o kbuf.o rsrc.o rw.o opdef.o \
-					notif.o waitid.o register.o truncate.o
+obj-$(CONFIG_IO_URING)		+= io_uring.o opdef.o kbuf.o rsrc.o notif.o \
+					tctx.o filetable.o rw.o net.o poll.o \
+					uring_cmd.o openclose.o sqpoll.o \
+					xattr.o nop.o fs.o splice.o sync.o \
+					msg_ring.o advise.o openclose.o \
+					epoll.o statx.o timeout.o fdinfo.o \
+					cancel.o waitid.o register.o \
+					truncate.o memmap.o
 obj-$(CONFIG_IO_WQ)		+= io-wq.o
 obj-$(CONFIG_FUTEX)		+= futex.o
 obj-$(CONFIG_NET_RX_BUSY_POLL)	+= napi.o
diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h
index bf2fb26a6539..b7a38a2069cf 100644
--- a/io_uring/alloc_cache.h
+++ b/io_uring/alloc_cache.h
@@ -4,63 +4,58 @@
 /*
  * Don't allow the cache to grow beyond this size.
  */
-#define IO_ALLOC_CACHE_MAX	512
-
-struct io_cache_entry {
-	struct io_wq_work_node node;
-};
+#define IO_ALLOC_CACHE_MAX	128
 
 static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
-				      struct io_cache_entry *entry)
+				      void *entry)
 {
 	if (cache->nr_cached < cache->max_cached) {
-		cache->nr_cached++;
-		wq_stack_add_head(&entry->node, &cache->list);
-		kasan_mempool_poison_object(entry);
+		if (!kasan_mempool_poison_object(entry))
+			return false;
+		cache->entries[cache->nr_cached++] = entry;
 		return true;
 	}
 	return false;
 }
 
-static inline bool io_alloc_cache_empty(struct io_alloc_cache *cache)
-{
-	return !cache->list.next;
-}
-
-static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache)
+static inline void *io_alloc_cache_get(struct io_alloc_cache *cache)
 {
-	if (cache->list.next) {
-		struct io_cache_entry *entry;
+	if (cache->nr_cached) {
+		void *entry = cache->entries[--cache->nr_cached];
 
-		entry = container_of(cache->list.next, struct io_cache_entry, node);
 		kasan_mempool_unpoison_object(entry, cache->elem_size);
-		cache->list.next = cache->list.next->next;
-		cache->nr_cached--;
 		return entry;
 	}
 
 	return NULL;
 }
 
-static inline void io_alloc_cache_init(struct io_alloc_cache *cache,
+/* returns false if the cache was initialized properly */
+static inline bool io_alloc_cache_init(struct io_alloc_cache *cache,
 				       unsigned max_nr, size_t size)
 {
-	cache->list.next = NULL;
-	cache->nr_cached = 0;
-	cache->max_cached = max_nr;
-	cache->elem_size = size;
+	cache->entries = kvmalloc_array(max_nr, sizeof(void *), GFP_KERNEL);
+	if (cache->entries) {
+		cache->nr_cached = 0;
+		cache->max_cached = max_nr;
+		cache->elem_size = size;
+		return false;
+	}
+	return true;
 }
 
 static inline void io_alloc_cache_free(struct io_alloc_cache *cache,
-				       void (*free)(struct io_cache_entry *))
+				       void (*free)(const void *))
 {
-	while (1) {
-		struct io_cache_entry *entry = io_alloc_cache_get(cache);
+	void *entry;
+
+	if (!cache->entries)
+		return;
 
-		if (!entry)
-			break;
+	while ((entry = io_alloc_cache_get(cache)) != NULL)
 		free(entry);
-	}
-	cache->nr_cached = 0;
+
+	kvfree(cache->entries);
+	cache->entries = NULL;
 }
 
 #endif
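The alloc_cache.h change above replaces the intrusive-list cache, where every cached object had to embed a struct io_cache_entry node, with a flat kvmalloc'ed array of void pointers. Objects no longer need an embedded hook, which is also what lets later hunks pass plain kfree() as the free callback. Below is a minimal userspace sketch of the same array-backed LIFO cache, with calloc()/free() standing in for kvmalloc_array()/kvfree() and the KASAN poison/unpoison hooks dropped; it illustrates the technique and is not the kernel code.

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

struct alloc_cache {
	void **entries;
	unsigned int nr_cached;
	unsigned int max_cached;
	size_t elem_size;
};

/* returns true on failure, matching the patch's inverted convention */
static bool cache_init(struct alloc_cache *cache, unsigned int max_nr, size_t size)
{
	cache->entries = calloc(max_nr, sizeof(void *));
	if (!cache->entries)
		return true;
	cache->nr_cached = 0;
	cache->max_cached = max_nr;
	cache->elem_size = size;
	return false;
}

static bool cache_put(struct alloc_cache *cache, void *entry)
{
	if (cache->nr_cached < cache->max_cached) {
		cache->entries[cache->nr_cached++] = entry;
		return true;
	}
	return false;	/* cache full: caller frees the entry itself */
}

static void *cache_get(struct alloc_cache *cache)
{
	if (cache->nr_cached)
		return cache->entries[--cache->nr_cached];
	return NULL;	/* cache empty: caller falls back to a fresh allocation */
}

static void cache_free(struct alloc_cache *cache, void (*free_fn)(void *))
{
	void *entry;

	if (!cache->entries)
		return;
	while ((entry = cache_get(cache)) != NULL)
		free_fn(entry);
	free(cache->entries);
	cache->entries = NULL;
}

int main(void)
{
	struct alloc_cache cache;
	void *obj;

	if (cache_init(&cache, 128, 64))
		return 1;
	/* recycle one object through the cache: put it, then get it back */
	obj = malloc(64);
	cache_put(&cache, obj);
	obj = cache_get(&cache);	/* same pointer comes back, LIFO order */
	printf("recycled entry: %p\n", obj);
	free(obj);
	cache_free(&cache, free);
	return 0;
}

The inverted init convention (true on failure) carried over from the patch pays off in the io_ring_ctx_alloc() hunk further down, where several init results are OR'ed together.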
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index acfcdd7f059a..a6e58a20efdd 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -184,9 +184,7 @@ static int __io_async_cancel(struct io_cancel_data *cd,
 	io_ring_submit_lock(ctx, issue_flags);
 	ret = -ENOENT;
 	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
-		struct io_uring_task *tctx = node->task->io_uring;
-
-		ret = io_async_cancel_one(tctx, cd);
+		ret = io_async_cancel_one(node->task->io_uring, cd);
 		if (ret != -ENOENT) {
 			if (!all)
 				break;
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index 8d444dd1b0a7..b1e0e0d85349 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -50,9 +50,9 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
  * Caller holds a reference to the file already, we don't need to do
  * anything else to get an extra reference.
  */
-__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
+__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
 {
-	struct io_ring_ctx *ctx = f->private_data;
+	struct io_ring_ctx *ctx = file->private_data;
 	struct io_overflow_cqe *ocqe;
 	struct io_rings *r = ctx->rings;
 	struct rusage sq_usage;
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index 6e86e6188dbe..997c56d32ee6 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -84,12 +84,12 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
 			return ret;
 
 		file_slot->file_ptr = 0;
-		io_file_bitmap_clear(&ctx->file_table, slot_index);
+	} else {
+		io_file_bitmap_set(&ctx->file_table, slot_index);
 	}
 
 	*io_get_tag_slot(ctx->file_data, slot_index) = 0;
 	io_fixed_file_set(file_slot, file);
-	io_file_bitmap_set(&ctx->file_table, slot_index);
 	return 0;
 }
diff --git a/io_uring/futex.c b/io_uring/futex.c
index 792a03df58de..914848f46beb 100644
--- a/io_uring/futex.c
+++ b/io_uring/futex.c
@@ -9,7 +9,7 @@
 
 #include "../kernel/futex/futex.h"
 #include "io_uring.h"
-#include "rsrc.h"
+#include "alloc_cache.h"
 #include "futex.h"
 
 struct io_futex {
@@ -27,27 +27,21 @@ struct io_futex {
 };
 
 struct io_futex_data {
-	union {
-		struct futex_q		q;
-		struct io_cache_entry	cache;
-	};
+	struct futex_q	q;
 	struct io_kiocb	*req;
 };
 
-void io_futex_cache_init(struct io_ring_ctx *ctx)
-{
-	io_alloc_cache_init(&ctx->futex_cache, IO_NODE_ALLOC_CACHE_MAX,
-				sizeof(struct io_futex_data));
-}
+#define IO_FUTEX_ALLOC_CACHE_MAX	32
 
-static void io_futex_cache_entry_free(struct io_cache_entry *entry)
+bool io_futex_cache_init(struct io_ring_ctx *ctx)
 {
-	kfree(container_of(entry, struct io_futex_data, cache));
+	return io_alloc_cache_init(&ctx->futex_cache, IO_FUTEX_ALLOC_CACHE_MAX,
+				sizeof(struct io_futex_data));
 }
 
 void io_futex_cache_free(struct io_ring_ctx *ctx)
 {
-	io_alloc_cache_free(&ctx->futex_cache, io_futex_cache_entry_free);
+	io_alloc_cache_free(&ctx->futex_cache, kfree);
 }
 
 static void __io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts)
@@ -63,7 +57,7 @@ static void io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts)
 	struct io_ring_ctx *ctx = req->ctx;
 
 	io_tw_lock(ctx, ts);
-	if (!io_alloc_cache_put(&ctx->futex_cache, &ifd->cache))
+	if (!io_alloc_cache_put(&ctx->futex_cache, ifd))
 		kfree(ifd);
 	__io_futex_complete(req, ts);
 }
@@ -259,11 +253,11 @@ static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q)
 
 static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx)
 {
-	struct io_cache_entry *entry;
+	struct io_futex_data *ifd;
 
-	entry = io_alloc_cache_get(&ctx->futex_cache);
-	if (entry)
-		return container_of(entry, struct io_futex_data, cache);
+	ifd = io_alloc_cache_get(&ctx->futex_cache);
+	if (ifd)
+		return ifd;
 
 	return kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT);
 }
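With cache entries typed as plain void *, futex.c can drop the union wrapping struct io_futex_data and all the container_of() conversions, and io_futex_cache_entry_free() collapses into kfree. The consumer idiom is: allocation tries the cache before the allocator, completion tries to recycle before freeing. A sketch of that get-or-alloc / put-or-free shape, continuing the userspace cache sketch above (struct alloc_cache, cache_get() and cache_put() are the hypothetical helpers defined there; futex_data stands in for io_futex_data):

/* continues the previous sketch; requires its struct alloc_cache helpers */
struct futex_data {
	int payload;
};

static struct futex_data *alloc_fd(struct alloc_cache *cache)
{
	struct futex_data *fd = cache_get(cache);

	/* cache miss: fall back to a fresh allocation, as io_alloc_ifd() does */
	if (!fd)
		fd = malloc(sizeof(*fd));
	return fd;
}

static void complete_fd(struct alloc_cache *cache, struct futex_data *fd)
{
	/* cache full: free immediately, as io_futex_complete() does */
	if (!cache_put(cache, fd))
		free(fd);
}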
diff --git a/io_uring/futex.h b/io_uring/futex.h
index 0847e9e8a127..b8bb09873d57 100644
--- a/io_uring/futex.h
+++ b/io_uring/futex.h
@@ -13,7 +13,7 @@ int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
 		    unsigned int issue_flags);
 bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
 			 bool cancel_all);
-void io_futex_cache_init(struct io_ring_ctx *ctx);
+bool io_futex_cache_init(struct io_ring_ctx *ctx);
 void io_futex_cache_free(struct io_ring_ctx *ctx);
 #else
 static inline int io_futex_cancel(struct io_ring_ctx *ctx,
@@ -27,8 +27,9 @@ static inline bool io_futex_remove_all(struct io_ring_ctx *ctx,
 {
 	return false;
 }
-static inline void io_futex_cache_init(struct io_ring_ctx *ctx)
+static inline bool io_futex_cache_init(struct io_ring_ctx *ctx)
 {
+	return false;
 }
 static inline void io_futex_cache_free(struct io_ring_ctx *ctx)
 {
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 522196dfb0ff..d1c47a9d9215 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -25,10 +25,10 @@
 #define WORKER_IDLE_TIMEOUT	(5 * HZ)
 
 enum {
-	IO_WORKER_F_UP		= 1,	/* up and active */
-	IO_WORKER_F_RUNNING	= 2,	/* account as running */
-	IO_WORKER_F_FREE	= 4,	/* worker on free list */
-	IO_WORKER_F_BOUND	= 8,	/* is doing bounded work */
+	IO_WORKER_F_UP		= 0,	/* up and active */
+	IO_WORKER_F_RUNNING	= 1,	/* account as running */
+	IO_WORKER_F_FREE	= 2,	/* worker on free list */
+	IO_WORKER_F_BOUND	= 3,	/* is doing bounded work */
 };
 
 enum {
@@ -44,21 +44,20 @@ enum {
  */
struct io_worker {
 	refcount_t ref;
-	unsigned flags;
+	int create_index;
+	unsigned long flags;
 	struct hlist_nulls_node nulls_node;
 	struct list_head all_list;
 	struct task_struct *task;
 	struct io_wq *wq;
 
 	struct io_wq_work *cur_work;
-	struct io_wq_work *next_work;
 	raw_spinlock_t lock;
 
 	struct completion ref_done;
 
 	unsigned long create_state;
 	struct callback_head create_work;
-	int create_index;
 
 	union {
 		struct rcu_head rcu;
@@ -165,7 +164,7 @@ static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq,
 
 static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker)
 {
-	return io_get_acct(worker->wq, worker->flags & IO_WORKER_F_BOUND);
+	return io_get_acct(worker->wq, test_bit(IO_WORKER_F_BOUND, &worker->flags));
 }
 
 static void io_worker_ref_put(struct io_wq *wq)
@@ -225,7 +224,7 @@ static void io_worker_exit(struct io_worker *worker)
 	wait_for_completion(&worker->ref_done);
 
 	raw_spin_lock(&wq->lock);
-	if (worker->flags & IO_WORKER_F_FREE)
+	if (test_bit(IO_WORKER_F_FREE, &worker->flags))
 		hlist_nulls_del_rcu(&worker->nulls_node);
 	list_del_rcu(&worker->all_list);
 	raw_spin_unlock(&wq->lock);
@@ -410,7 +409,7 @@ static void io_wq_dec_running(struct io_worker *worker)
 	struct io_wq_acct *acct = io_wq_get_acct(worker);
 	struct io_wq *wq = worker->wq;
 
-	if (!(worker->flags & IO_WORKER_F_UP))
+	if (!test_bit(IO_WORKER_F_UP, &worker->flags))
 		return;
 
 	if (!atomic_dec_and_test(&acct->nr_running))
@@ -430,8 +429,8 @@ static void io_wq_dec_running(struct io_worker *worker)
  */
 static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker)
 {
-	if (worker->flags & IO_WORKER_F_FREE) {
-		worker->flags &= ~IO_WORKER_F_FREE;
+	if (test_bit(IO_WORKER_F_FREE, &worker->flags)) {
+		clear_bit(IO_WORKER_F_FREE, &worker->flags);
 		raw_spin_lock(&wq->lock);
 		hlist_nulls_del_init_rcu(&worker->nulls_node);
 		raw_spin_unlock(&wq->lock);
@@ -444,8 +443,8 @@ static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker)
 static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker)
 	__must_hold(wq->lock)
 {
-	if (!(worker->flags & IO_WORKER_F_FREE)) {
-		worker->flags |= IO_WORKER_F_FREE;
+	if (!test_bit(IO_WORKER_F_FREE, &worker->flags)) {
+		set_bit(IO_WORKER_F_FREE, &worker->flags);
 		hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list);
 	}
 }
@@ -539,7 +538,6 @@ static void io_assign_current_work(struct io_worker *worker,
 
 	raw_spin_lock(&worker->lock);
 	worker->cur_work = work;
-	worker->next_work = NULL;
 	raw_spin_unlock(&worker->lock);
 }
 
@@ -564,10 +562,7 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
 		 * clear the stalled flag.
 		 */
 		work = io_get_next_work(acct, worker);
-		raw_spin_unlock(&acct->lock);
 		if (work) {
-			__io_worker_busy(wq, worker);
-
 			/*
 			 * Make sure cancelation can find this, even before
 			 * it becomes the active work. That avoids a window
@@ -576,11 +571,17 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
 			 * current work item for this worker.
 			 */
 			raw_spin_lock(&worker->lock);
-			worker->next_work = work;
+			worker->cur_work = work;
 			raw_spin_unlock(&worker->lock);
-		} else {
-			break;
 		}
+
+		raw_spin_unlock(&acct->lock);
+
+		if (!work)
+			break;
+
+		__io_worker_busy(wq, worker);
+
 		io_assign_current_work(worker, work);
 		__set_current_state(TASK_RUNNING);
 
@@ -631,7 +632,8 @@ static int io_wq_worker(void *data)
 	bool exit_mask = false, last_timeout = false;
 	char buf[TASK_COMM_LEN];
 
-	worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
+	set_mask_bits(&worker->flags, 0,
+		      BIT(IO_WORKER_F_UP) | BIT(IO_WORKER_F_RUNNING));
 
 	snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid);
 	set_task_comm(current, buf);
@@ -695,11 +697,11 @@ void io_wq_worker_running(struct task_struct *tsk)
 
 	if (!worker)
 		return;
-	if (!(worker->flags & IO_WORKER_F_UP))
+	if (!test_bit(IO_WORKER_F_UP, &worker->flags))
 		return;
-	if (worker->flags & IO_WORKER_F_RUNNING)
+	if (test_bit(IO_WORKER_F_RUNNING, &worker->flags))
 		return;
-	worker->flags |= IO_WORKER_F_RUNNING;
+	set_bit(IO_WORKER_F_RUNNING, &worker->flags);
 	io_wq_inc_running(worker);
 }
 
@@ -713,12 +715,12 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
 
 	if (!worker)
 		return;
-	if (!(worker->flags & IO_WORKER_F_UP))
+	if (!test_bit(IO_WORKER_F_UP, &worker->flags))
 		return;
-	if (!(worker->flags & IO_WORKER_F_RUNNING))
+	if (!test_bit(IO_WORKER_F_RUNNING, &worker->flags))
 		return;
 
-	worker->flags &= ~IO_WORKER_F_RUNNING;
+	clear_bit(IO_WORKER_F_RUNNING, &worker->flags);
 	io_wq_dec_running(worker);
 }
 
@@ -732,7 +734,7 @@ static void io_init_new_worker(struct io_wq *wq, struct io_worker *worker,
 	raw_spin_lock(&wq->lock);
 	hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list);
 	list_add_tail_rcu(&worker->all_list, &wq->all_list);
-	worker->flags |= IO_WORKER_F_FREE;
+	set_bit(IO_WORKER_F_FREE, &worker->flags);
 	raw_spin_unlock(&wq->lock);
 	wake_up_new_task(tsk);
 }
@@ -838,7 +840,7 @@ fail:
 	init_completion(&worker->ref_done);
 
 	if (index == IO_WQ_ACCT_BOUND)
-		worker->flags |= IO_WORKER_F_BOUND;
+		set_bit(IO_WORKER_F_BOUND, &worker->flags);
 
 	tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE);
 	if (!IS_ERR(tsk)) {
@@ -924,8 +926,8 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data)
 void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
 {
 	struct io_wq_acct *acct = io_work_get_acct(wq, work);
+	unsigned long work_flags = work->flags;
 	struct io_cb_cancel_data match;
-	unsigned work_flags = work->flags;
 	bool do_create;
 
 	/*
@@ -1005,8 +1007,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
 	 * may dereference the passed in work.
 	 */
 	raw_spin_lock(&worker->lock);
-	if (__io_wq_worker_cancel(worker, match, worker->cur_work) ||
-	    __io_wq_worker_cancel(worker, match, worker->next_work))
+	if (__io_wq_worker_cancel(worker, match, worker->cur_work))
 		match->nr_running++;
 	raw_spin_unlock(&worker->lock);
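The io-wq change converts worker->flags from mask values (1, 2, 4, 8) updated with plain |= and &= into bit numbers (0 through 3) updated with the kernel's atomic set_bit()/clear_bit()/test_bit(), so concurrent readers such as the cancel path observe consistent flag updates without a lock. A rough userspace analogue using C11 atomics follows; the kernel bitops have different ordering guarantees, so this only illustrates the mask-to-bit-number shift, not the exact semantics.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum {
	WORKER_F_UP	= 0,	/* bit numbers now, not masks */
	WORKER_F_RUNNING = 1,
	WORKER_F_FREE	= 2,
	WORKER_F_BOUND	= 3,
};

static void set_bit_ul(int nr, atomic_ulong *addr)
{
	atomic_fetch_or(addr, 1UL << nr);	/* atomic read-modify-write */
}

static void clear_bit_ul(int nr, atomic_ulong *addr)
{
	atomic_fetch_and(addr, ~(1UL << nr));
}

static bool test_bit_ul(int nr, atomic_ulong *addr)
{
	return atomic_load(addr) & (1UL << nr);
}

int main(void)
{
	atomic_ulong flags = 0;

	/* what used to be flags |= (F_UP | F_RUNNING) as one plain store */
	set_bit_ul(WORKER_F_UP, &flags);
	set_bit_ul(WORKER_F_RUNNING, &flags);

	clear_bit_ul(WORKER_F_RUNNING, &flags);
	printf("up=%d running=%d\n",
	       test_bit_ul(WORKER_F_UP, &flags),
	       test_bit_ul(WORKER_F_RUNNING, &flags));
	return 0;
}

This is also why the enum values changed from powers of two to consecutive integers: set_bit() and friends take a bit index, while |= takes a precomputed mask (the patch's set_mask_bits() call builds the mask with BIT()).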
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 380b9ce1d301..86fd72f6a1c2 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -63,7 +63,6 @@
 #include <linux/sched/mm.h>
 #include <linux/uaccess.h>
 #include <linux/nospec.h>
-#include <linux/highmem.h>
 #include <linux/fsnotify.h>
 #include <linux/fadvise.h>
 #include <linux/task_work.h>
@@ -95,6 +94,8 @@
 #include "waitid.h"
 #include "futex.h"
 #include "napi.h"
+#include "uring_cmd.h"
+#include "memmap.h"
 
 #include "timeout.h"
 #include "poll.h"
@@ -170,17 +171,9 @@ static struct ctl_table kernel_io_uring_disabled_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
-	{},
 };
 #endif
 
-static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
-{
-	if (!wq_list_empty(&ctx->submit_state.compl_reqs) ||
-	    ctx->submit_state.cqes_count)
-		__io_submit_flush_completions(ctx);
-}
-
 static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
 {
 	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
@@ -253,14 +246,12 @@ static __cold void io_fallback_req_func(struct work_struct *work)
 						fallback_work.work);
 	struct llist_node *node = llist_del_all(&ctx->fallback_llist);
 	struct io_kiocb *req, *tmp;
-	struct io_tw_state ts = { .locked = true, };
+	struct io_tw_state ts = {};
 
 	percpu_ref_get(&ctx->refs);
 	mutex_lock(&ctx->uring_lock);
 	llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
 		req->io_task_work.func(req, &ts);
-	if (WARN_ON_ONCE(!ts.locked))
-		return;
 	io_submit_flush_completions(ctx);
 	mutex_unlock(&ctx->uring_lock);
 	percpu_ref_put(&ctx->refs);
@@ -284,6 +275,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 {
 	struct io_ring_ctx *ctx;
 	int hash_bits;
+	bool ret;
 
 	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 	if (!ctx)
@@ -312,14 +304,19 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->sqd_list);
 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 	INIT_LIST_HEAD(&ctx->io_buffers_cache);
-	INIT_HLIST_HEAD(&ctx->io_buf_list);
-	io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
+	ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
 			    sizeof(struct io_rsrc_node));
-	io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
+	ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX,
 			    sizeof(struct async_poll));
-	io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
+	ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
 			    sizeof(struct io_async_msghdr));
-	io_futex_cache_init(ctx);
+	ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX,
+			    sizeof(struct io_async_rw));
+	ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX,
+			    sizeof(struct uring_cache));
+	ret |= io_futex_cache_init(ctx);
+	if (ret)
+		goto err;
 	init_completion(&ctx->ref_comp);
 	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
 	mutex_init(&ctx->uring_lock);
@@ -337,7 +334,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	init_llist_head(&ctx->work_llist);
 	INIT_LIST_HEAD(&ctx->tctx_list);
 	ctx->submit_state.free_list.next = NULL;
-	INIT_WQ_LIST(&ctx->locked_free_list);
 	INIT_HLIST_HEAD(&ctx->waitid_list);
 #ifdef CONFIG_FUTEX
 	INIT_HLIST_HEAD(&ctx->futex_list);
@@ -349,6 +345,12 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 
 	return ctx;
 err:
+	io_alloc_cache_free(&ctx->rsrc_node_cache, kfree);
+	io_alloc_cache_free(&ctx->apoll_cache, kfree);
+	io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
+	io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
+	io_alloc_cache_free(&ctx->uring_cache, kfree);
+	io_futex_cache_free(ctx);
 	kfree(ctx->cancel_table.hbs);
 	kfree(ctx->cancel_table_locked.hbs);
 	xa_destroy(&ctx->io_bl_xa);
@@ -379,7 +381,7 @@ static void io_clean_op(struct io_kiocb *req)
 {
 	if (req->flags & REQ_F_BUFFER_SELECTED) {
 		spin_lock(&req->ctx->completion_lock);
-		io_put_kbuf_comp(req);
+		io_kbuf_drop(req);
 		spin_unlock(&req->ctx->completion_lock);
 	}
 
@@ -498,7 +500,7 @@ static void io_prep_async_link(struct io_kiocb *req)
 	}
 }
 
-void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use)
+static void io_queue_iowq(struct io_kiocb *req)
 {
 	struct io_kiocb *link = io_prep_linked_timeout(req);
 	struct io_uring_task *tctx = req->task->io_uring;
@@ -666,28 +668,14 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx)
 	io_commit_cqring_flush(ctx);
 }
 
-static void io_cqring_overflow_kill(struct io_ring_ctx *ctx)
-{
-	struct io_overflow_cqe *ocqe;
-	LIST_HEAD(list);
-
-	spin_lock(&ctx->completion_lock);
-	list_splice_init(&ctx->cq_overflow_list, &list);
-	clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
-	spin_unlock(&ctx->completion_lock);
-
-	while (!list_empty(&list)) {
-		ocqe = list_first_entry(&list, struct io_overflow_cqe, list);
-		list_del(&ocqe->list);
-		kfree(ocqe);
-	}
-}
-
-static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx)
+static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying)
 {
 	size_t cqe_size = sizeof(struct io_uring_cqe);
 
-	if (__io_cqring_events(ctx) == ctx->cq_entries)
+	lockdep_assert_held(&ctx->uring_lock);
+
+	/* don't abort if we're dying, entries must get freed */
+	if (!dying && __io_cqring_events(ctx) == ctx->cq_entries)
 		return;
 
 	if (ctx->flags & IORING_SETUP_CQE32)
@@ -698,11 +686,14 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx)
 		struct io_uring_cqe *cqe;
 		struct io_overflow_cqe *ocqe;
 
-		if (!io_get_cqe_overflow(ctx, &cqe, true))
-			break;
 		ocqe = list_first_entry(&ctx->cq_overflow_list,
 					struct io_overflow_cqe, list);
-		memcpy(cqe, &ocqe->cqe, cqe_size);
+
+		if (!dying) {
+			if (!io_get_cqe_overflow(ctx, &cqe, true))
+				break;
+			memcpy(cqe, &ocqe->cqe, cqe_size);
+		}
 		list_del(&ocqe->list);
 		kfree(ocqe);
 	}
@@ -714,20 +705,17 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx)
 	io_cq_unlock_post(ctx);
 }
 
-static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
+static void io_cqring_overflow_kill(struct io_ring_ctx *ctx)
 {
-	/* iopoll syncs against uring_lock, not completion_lock */
-	if (ctx->flags & IORING_SETUP_IOPOLL)
-		mutex_lock(&ctx->uring_lock);
-	__io_cqring_overflow_flush(ctx);
-	if (ctx->flags & IORING_SETUP_IOPOLL)
-		mutex_unlock(&ctx->uring_lock);
+	if (ctx->rings)
+		__io_cqring_overflow_flush(ctx, true);
 }
 
-static void io_cqring_overflow_flush(struct io_ring_ctx *ctx)
+static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
 {
-	if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
-		io_cqring_do_overflow_flush(ctx);
+	mutex_lock(&ctx->uring_lock);
+	__io_cqring_overflow_flush(ctx, false);
+	mutex_unlock(&ctx->uring_lock);
 }
 
 /* can be called by any task */
@@ -817,7 +805,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
 	return true;
 }
 
-void io_req_cqe_overflow(struct io_kiocb *req)
+static void io_req_cqe_overflow(struct io_kiocb *req)
 {
 	io_cqring_event_overflow(req->ctx, req->cqe.user_data,
 				req->cqe.res, req->cqe.flags,
@@ -890,151 +878,71 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
 	return false;
 }
 
-static void __io_flush_post_cqes(struct io_ring_ctx *ctx)
-	__must_hold(&ctx->uring_lock)
-{
-	struct io_submit_state *state = &ctx->submit_state;
-	unsigned int i;
-
-	lockdep_assert_held(&ctx->uring_lock);
-	for (i = 0; i < state->cqes_count; i++) {
-		struct io_uring_cqe *cqe = &ctx->completion_cqes[i];
-
-		if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) {
-			if (ctx->lockless_cq) {
-				spin_lock(&ctx->completion_lock);
-				io_cqring_event_overflow(ctx, cqe->user_data,
-							cqe->res, cqe->flags, 0, 0);
-				spin_unlock(&ctx->completion_lock);
-			} else {
-				io_cqring_event_overflow(ctx, cqe->user_data,
-							cqe->res, cqe->flags, 0, 0);
-			}
-		}
-	}
-	state->cqes_count = 0;
-}
-
-static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
-			      bool allow_overflow)
+bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
 {
 	bool filled;
 
 	io_cq_lock(ctx);
 	filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
-	if (!filled && allow_overflow)
+	if (!filled)
 		filled = io_cqring_event_overflow(ctx, user_data, res,
 						  cflags, 0, 0);
-
 	io_cq_unlock_post(ctx);
 	return filled;
 }
 
-bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
-{
-	return __io_post_aux_cqe(ctx, user_data, res, cflags, true);
-}
-
 /*
  * A helper for multishot requests posting additional CQEs.
  * Should only be used from a task_work including IO_URING_F_MULTISHOT.
  */
-bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags)
+bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	u64 user_data = req->cqe.user_data;
-	struct io_uring_cqe *cqe;
+	bool posted;
 
 	lockdep_assert(!io_wq_current_is_worker());
-
-	if (!defer)
-		return __io_post_aux_cqe(ctx, user_data, res, cflags, false);
-
 	lockdep_assert_held(&ctx->uring_lock);
 
-	if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->completion_cqes)) {
-		__io_cq_lock(ctx);
-		__io_flush_post_cqes(ctx);
-		/* no need to flush - flush is deferred */
-		__io_cq_unlock_post(ctx);
-	}
-
-	/* For defered completions this is not as strict as it is otherwise,
-	 * however it's main job is to prevent unbounded posted completions,
-	 * and in that it works just as well.
-	 */
-	if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
-		return false;
-
-	cqe = &ctx->completion_cqes[ctx->submit_state.cqes_count++];
-	cqe->user_data = user_data;
-	cqe->res = res;
-	cqe->flags = cflags;
-	return true;
+	__io_cq_lock(ctx);
+	posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags);
+	ctx->submit_state.cq_flush = true;
+	__io_cq_unlock_post(ctx);
+	return posted;
 }
 
-static void __io_req_complete_post(struct io_kiocb *req, unsigned
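A note on the io_ring_ctx_alloc() hunk above: the inverted io_alloc_cache_init() convention (true on failure) is what allows the function to OR the results of several cache setups together and branch to a single error path, where io_alloc_cache_free() safely handles caches whose backing array was never allocated. A sketch of that shape, continuing the first userspace cache sketch (cache_init() and cache_free() are the hypothetical helpers defined there; the struct layout and sizes are illustrative):

/* continues the first sketch; requires its cache_init()/cache_free() */
struct ring_ctx {
	struct alloc_cache rsrc_cache;
	struct alloc_cache poll_cache;
	struct alloc_cache msg_cache;
};

static bool ctx_init_caches(struct ring_ctx *ctx)
{
	bool failed;

	/* aggregate failures instead of unwinding after each call */
	failed  = cache_init(&ctx->rsrc_cache, 32, 64);
	failed |= cache_init(&ctx->poll_cache, 32, 128);
	failed |= cache_init(&ctx->msg_cache, 128, 256);
	if (failed) {
		/* cache_free() tolerates caches whose init failed (NULL entries) */
		cache_free(&ctx->rsrc_cache, free);
		cache_free(&ctx->poll_cache, free);
		cache_free(&ctx->msg_cache, free);
	}
	return failed;
}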
