diff options
Diffstat (limited to 'io_uring/io_uring.c')
| -rw-r--r-- | io_uring/io_uring.c | 3980 |
1 files changed, 3980 insertions, 0 deletions
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c new file mode 100644 index 000000000000..4b3fd645d023 --- /dev/null +++ b/io_uring/io_uring.c @@ -0,0 +1,3980 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared application/kernel submission and completion ring pairs, for + * supporting fast/efficient IO. + * + * A note on the read/write ordering memory barriers that are matched between + * the application and kernel side. + * + * After the application reads the CQ ring tail, it must use an + * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses + * before writing the tail (using smp_load_acquire to read the tail will + * do). It also needs a smp_mb() before updating CQ head (ordering the + * entry load(s) with the head store), pairing with an implicit barrier + * through a control-dependency in io_get_cqe (smp_store_release to + * store head will do). Failure to do so could lead to reading invalid + * CQ entries. + * + * Likewise, the application must use an appropriate smp_wmb() before + * writing the SQ tail (ordering SQ entry stores with the tail store), + * which pairs with smp_load_acquire in io_get_sqring (smp_store_release + * to store the tail will do). And it needs a barrier ordering the SQ + * head load before writing new SQ entries (smp_load_acquire to read + * head will do). + * + * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application + * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after* + * updating the SQ tail; a full memory barrier smp_mb() is needed + * between. + * + * Also see the examples in the liburing library: + * + * git://git.kernel.dk/liburing + * + * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens + * from data shared between the kernel and application. This is done both + * for ordering purposes, but also to ensure that once a value is loaded from + * data that the application could potentially modify, it remains stable. + * + * Copyright (C) 2018-2019 Jens Axboe + * Copyright (c) 2018-2019 Christoph Hellwig + */ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/syscalls.h> +#include <net/compat.h> +#include <linux/refcount.h> +#include <linux/uio.h> +#include <linux/bits.h> + +#include <linux/sched/signal.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/fdtable.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/percpu.h> +#include <linux/slab.h> +#include <linux/bvec.h> +#include <linux/net.h> +#include <net/sock.h> +#include <net/af_unix.h> +#include <net/scm.h> +#include <linux/anon_inodes.h> +#include <linux/sched/mm.h> +#include <linux/uaccess.h> +#include <linux/nospec.h> +#include <linux/highmem.h> +#include <linux/fsnotify.h> +#include <linux/fadvise.h> +#include <linux/task_work.h> +#include <linux/io_uring.h> +#include <linux/audit.h> +#include <linux/security.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/io_uring.h> + +#include <uapi/linux/io_uring.h> + +#include "io-wq.h" + +#include "io_uring.h" +#include "opdef.h" +#include "refs.h" +#include "tctx.h" +#include "sqpoll.h" +#include "fdinfo.h" +#include "kbuf.h" +#include "rsrc.h" +#include "cancel.h" +#include "net.h" + +#include "timeout.h" +#include "poll.h" +#include "alloc_cache.h" + +#define IORING_MAX_ENTRIES 32768 +#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) + +#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ + IORING_REGISTER_LAST + IORING_OP_LAST) + +#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ + IOSQE_IO_HARDLINK | IOSQE_ASYNC) + +#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \ + IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS) + +#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ + REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ + REQ_F_ASYNC_DATA) + +#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ + IO_REQ_CLEAN_FLAGS) + +#define IO_TCTX_REFS_CACHE_NR (1U << 10) + +#define IO_COMPL_BATCH 32 +#define IO_REQ_ALLOC_BATCH 8 + +enum { + IO_CHECK_CQ_OVERFLOW_BIT, + IO_CHECK_CQ_DROPPED_BIT, +}; + +struct io_defer_entry { + struct list_head list; + struct io_kiocb *req; + u32 seq; +}; + +/* requests with any of those set should undergo io_disarm_next() */ +#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) +#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) + +static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, + struct task_struct *task, + bool cancel_all); + +static void io_dismantle_req(struct io_kiocb *req); +static void io_clean_op(struct io_kiocb *req); +static void io_queue_sqe(struct io_kiocb *req); + +static void __io_submit_flush_completions(struct io_ring_ctx *ctx); + +static struct kmem_cache *req_cachep; + +struct sock *io_uring_get_socket(struct file *file) +{ +#if defined(CONFIG_UNIX) + if (io_is_uring_fops(file)) { + struct io_ring_ctx *ctx = file->private_data; + + return ctx->ring_sock->sk; + } +#endif + return NULL; +} +EXPORT_SYMBOL(io_uring_get_socket); + +static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) +{ + if (!wq_list_empty(&ctx->submit_state.compl_reqs)) + __io_submit_flush_completions(ctx); +} + +static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) +{ + return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); +} + +static bool io_match_linked(struct io_kiocb *head) +{ + struct io_kiocb *req; + + io_for_each_link(req, head) { + if (req->flags & REQ_F_INFLIGHT) + return true; + } + return false; +} + +/* + * As io_match_task() but protected against racing with linked timeouts. + * User must not hold timeout_lock. + */ +bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, + bool cancel_all) +{ + bool matched; + + if (task && head->task != task) + return false; + if (cancel_all) + return true; + + if (head->flags & REQ_F_LINK_TIMEOUT) { + struct io_ring_ctx *ctx = head->ctx; + + /* protect against races with linked timeouts */ + spin_lock_irq(&ctx->timeout_lock); + matched = io_match_linked(head); + spin_unlock_irq(&ctx->timeout_lock); + } else { + matched = io_match_linked(head); + } + return matched; +} + +static inline void req_fail_link_node(struct io_kiocb *req, int res) +{ + req_set_fail(req); + io_req_set_res(req, res, 0); +} + +static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) +{ + wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); +} + +static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) +{ + struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); + + complete(&ctx->ref_comp); +} + +static __cold void io_fallback_req_func(struct work_struct *work) +{ + struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, + fallback_work.work); + struct llist_node *node = llist_del_all(&ctx->fallback_llist); + struct io_kiocb *req, *tmp; + bool locked = false; + + percpu_ref_get(&ctx->refs); + llist_for_each_entry_safe(req, tmp, node, io_task_work.node) + req->io_task_work.func(req, &locked); + + if (locked) { + io_submit_flush_completions(ctx); + mutex_unlock(&ctx->uring_lock); + } + percpu_ref_put(&ctx->refs); +} + +static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits) +{ + unsigned hash_buckets = 1U << bits; + size_t hash_size = hash_buckets * sizeof(table->hbs[0]); + + table->hbs = kmalloc(hash_size, GFP_KERNEL); + if (!table->hbs) + return -ENOMEM; + + table->hash_bits = bits; + init_hash_table(table, hash_buckets); + return 0; +} + +static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) +{ + struct io_ring_ctx *ctx; + int hash_bits; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + xa_init(&ctx->io_bl_xa); + + /* + * Use 5 bits less than the max cq entries, that should give us around + * 32 entries per hash list if totally full and uniformly spread, but + * don't keep too many buckets to not overconsume memory. + */ + hash_bits = ilog2(p->cq_entries) - 5; + hash_bits = clamp(hash_bits, 1, 8); + if (io_alloc_hash_table(&ctx->cancel_table, hash_bits)) + goto err; + if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits)) + goto err; + + ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); + if (!ctx->dummy_ubuf) + goto err; + /* set invalid range, so io_import_fixed() fails meeting it */ + ctx->dummy_ubuf->ubuf = -1UL; + + if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, + 0, GFP_KERNEL)) + goto err; + + ctx->flags = p->flags; + init_waitqueue_head(&ctx->sqo_sq_wait); + INIT_LIST_HEAD(&ctx->sqd_list); + INIT_LIST_HEAD(&ctx->cq_overflow_list); + INIT_LIST_HEAD(&ctx->io_buffers_cache); + io_alloc_cache_init(&ctx->apoll_cache); + io_alloc_cache_init(&ctx->netmsg_cache); + init_completion(&ctx->ref_comp); + xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); + mutex_init(&ctx->uring_lock); + init_waitqueue_head(&ctx->cq_wait); + spin_lock_init(&ctx->completion_lock); + spin_lock_init(&ctx->timeout_lock); + INIT_WQ_LIST(&ctx->iopoll_list); + INIT_LIST_HEAD(&ctx->io_buffers_pages); + INIT_LIST_HEAD(&ctx->io_buffers_comp); + INIT_LIST_HEAD(&ctx->defer_list); + INIT_LIST_HEAD(&ctx->timeout_list); + INIT_LIST_HEAD(&ctx->ltimeout_list); + spin_lock_init(&ctx->rsrc_ref_lock); + INIT_LIST_HEAD(&ctx->rsrc_ref_list); + INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); + init_llist_head(&ctx->rsrc_put_llist); + INIT_LIST_HEAD(&ctx->tctx_list); + ctx->submit_state.free_list.next = NULL; + INIT_WQ_LIST(&ctx->locked_free_list); + INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); + INIT_WQ_LIST(&ctx->submit_state.compl_reqs); + return ctx; +err: + kfree(ctx->dummy_ubuf); + kfree(ctx->cancel_table.hbs); + kfree(ctx->cancel_table_locked.hbs); + kfree(ctx->io_bl); + xa_destroy(&ctx->io_bl_xa); + kfree(ctx); + return NULL; +} + +static void io_account_cq_overflow(struct io_ring_ctx *ctx) +{ + struct io_rings *r = ctx->rings; + + WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); + ctx->cq_extra--; +} + +static bool req_need_defer(struct io_kiocb *req, u32 seq) +{ + if (unlikely(req->flags & REQ_F_IO_DRAIN)) { + struct io_ring_ctx *ctx = req->ctx; + + return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; + } + + return false; +} + +static inline void io_req_track_inflight(struct io_kiocb *req) +{ + if (!(req->flags & REQ_F_INFLIGHT)) { + req->flags |= REQ_F_INFLIGHT; + atomic_inc(&req->task->io_uring->inflight_tracked); + } +} + +static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) +{ + if (WARN_ON_ONCE(!req->link)) + return NULL; + + req->flags &= ~REQ_F_ARM_LTIMEOUT; + req->flags |= REQ_F_LINK_TIMEOUT; + + /* linked timeouts should have two refs once prep'ed */ + io_req_set_refcount(req); + __io_req_set_refcount(req->link, 2); + return req->link; +} + +static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) +{ + if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT))) + return NULL; + return __io_prep_linked_timeout(req); +} + +static noinline void __io_arm_ltimeout(struct io_kiocb *req) +{ + io_queue_linked_timeout(__io_prep_linked_timeout(req)); +} + +static inline void io_arm_ltimeout(struct io_kiocb *req) +{ + if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT)) + __io_arm_ltimeout(req); +} + +static void io_prep_async_work(struct io_kiocb *req) +{ + const struct io_op_def *def = &io_op_defs[req->opcode]; + struct io_ring_ctx *ctx = req->ctx; + + if (!(req->flags & REQ_F_CREDS)) { + req->flags |= REQ_F_CREDS; + req->creds = get_current_cred(); + } + + req->work.list.next = NULL; + req->work.flags = 0; + req->work.cancel_seq = atomic_read(&ctx->cancel_seq); + if (req->flags & REQ_F_FORCE_ASYNC) + req->work.flags |= IO_WQ_WORK_CONCURRENT; + + if (req->file && !io_req_ffs_set(req)) + req->flags |= io_file_get_flags(req->file) << REQ_F_SUPPORT_NOWAIT_BIT; + + if (req->flags & REQ_F_ISREG) { + if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) + io_wq_hash_work(&req->work, file_inode(req->file)); + } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { + if (def->unbound_nonreg_file) + req->work.flags |= IO_WQ_WORK_UNBOUND; + } +} + +static void io_prep_async_link(struct io_kiocb *req) +{ + struct io_kiocb *cur; + + if (req->flags & REQ_F_LINK_TIMEOUT) { + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->timeout_lock); + io_for_each_link(cur, req) + io_prep_async_work(cur); + spin_unlock_irq(&ctx->timeout_lock); + } else { + io_for_each_link(cur, req) + io_prep_async_work(cur); + } +} + +void io_queue_iowq(struct io_kiocb *req, bool *dont_use) +{ + struct io_kiocb *link = io_prep_linked_timeout(req); + struct io_uring_task *tctx = req->task->io_uring; + + BUG_ON(!tctx); + BUG_ON(!tctx->io_wq); + + /* init ->work of the whole link before punting */ + io_prep_async_link(req); + + /* + * Not expected to happen, but if we do have a bug where this _can_ + * happen, catch it here and ensure the request is marked as + * canceled. That will make io-wq go through the usual work cancel + * procedure rather than attempt to run this request (or create a new + * worker for it). + */ + if (WARN_ON_ONCE(!same_thread_group(req->task, current))) + req->work.flags |= IO_WQ_WORK_CANCEL; + + trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work)); + io_wq_enqueue(tctx->io_wq, &req->work); + if (link) + io_queue_linked_timeout(link); +} + +static __cold void io_queue_deferred(struct io_ring_ctx *ctx) +{ + while (!list_empty(&ctx->defer_list)) { + struct io_defer_entry *de = list_first_entry(&ctx->defer_list, + struct io_defer_entry, list); + + if (req_need_defer(de->req, de->seq)) + break; + list_del_init(&de->list); + io_req_task_queue(de->req); + kfree(de); + } +} + +static void io_eventfd_signal(struct io_ring_ctx *ctx) +{ + struct io_ev_fd *ev_fd; + bool skip; + + spin_lock(&ctx->completion_lock); + /* + * Eventfd should only get triggered when at least one event has been + * posted. Some applications rely on the eventfd notification count only + * changing IFF a new CQE has been added to the CQ ring. There's no + * depedency on 1:1 relationship between how many times this function is + * called (and hence the eventfd count) and number of CQEs posted to the + * CQ ring. + */ + skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail; + ctx->evfd_last_cq_tail = ctx->cached_cq_tail; + spin_unlock(&ctx->completion_lock); + if (skip) + return; + + rcu_read_lock(); + /* + * rcu_dereference ctx->io_ev_fd once and use it for both for checking + * and eventfd_signal + */ + ev_fd = rcu_dereference(ctx->io_ev_fd); + + /* + * Check again if ev_fd exists incase an io_eventfd_unregister call + * completed between the NULL check of ctx->io_ev_fd at the start of + * the function and rcu_read_lock. + */ + if (unlikely(!ev_fd)) + goto out; + if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) + goto out; + + if (!ev_fd->eventfd_async || io_wq_current_is_worker()) + eventfd_signal(ev_fd->cq_ev_fd, 1); +out: + rcu_read_unlock(); +} + +void __io_commit_cqring_flush(struct io_ring_ctx *ctx) +{ + if (ctx->off_timeout_used || ctx->drain_active) { + spin_lock(&ctx->completion_lock); + if (ctx->off_timeout_used) + io_flush_timeouts(ctx); + if (ctx->drain_active) + io_queue_deferred(ctx); + spin_unlock(&ctx->completion_lock); + } + if (ctx->has_evfd) + io_eventfd_signal(ctx); +} + +static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx) +{ + io_commit_cqring_flush(ctx); + io_cqring_wake(ctx); +} + +static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) + __releases(ctx->completion_lock) +{ + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + io_cqring_ev_posted(ctx); +} + +void io_cq_unlock_post(struct io_ring_ctx *ctx) +{ + __io_cq_unlock_post(ctx); +} + +/* Returns true if there are no backlogged entries after the flush */ +static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) +{ + bool all_flushed; + size_t cqe_size = sizeof(struct io_uring_cqe); + + if (!force && __io_cqring_events(ctx) == ctx->cq_entries) + return false; + + if (ctx->flags & IORING_SETUP_CQE32) + cqe_size <<= 1; + + io_cq_lock(ctx); + while (!list_empty(&ctx->cq_overflow_list)) { + struct io_uring_cqe *cqe = io_get_cqe(ctx); + struct io_overflow_cqe *ocqe; + + if (!cqe && !force) + break; + ocqe = list_first_entry(&ctx->cq_overflow_list, + struct io_overflow_cqe, list); + if (cqe) + memcpy(cqe, &ocqe->cqe, cqe_size); + else + io_account_cq_overflow(ctx); + + list_del(&ocqe->list); + kfree(ocqe); + } + + all_flushed = list_empty(&ctx->cq_overflow_list); + if (all_flushed) { + clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); + atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); + } + + io_cq_unlock_post(ctx); + return all_flushed; +} + +static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) +{ + bool ret = true; + + if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) { + /* iopoll syncs against uring_lock, not completion_lock */ + if (ctx->flags & IORING_SETUP_IOPOLL) + mutex_lock(&ctx->uring_lock); + ret = __io_cqring_overflow_flush(ctx, false); + if (ctx->flags & IORING_SETUP_IOPOLL) + mutex_unlock(&ctx->uring_lock); + } + + return ret; +} + +static void __io_put_task(struct task_struct *task, int nr) +{ + struct io_uring_task *tctx = task->io_uring; + + percpu_counter_sub(&tctx->inflight, nr); + if (unlikely(atomic_read(&tctx->in_idle))) + wake_up(&tctx->wait); + put_task_struct_many(task, nr); +} + +/* must to be called somewhat shortly after putting a request */ +static inline void io_put_task(struct task_struct *task, int nr) +{ + if (likely(task == current)) + task->io_uring->cached_refs += nr; + else + __io_put_task(task, nr); +} + +static void io_task_refs_refill(struct io_uring_task *tctx) +{ + unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; + + percpu_counter_add(&tctx->inflight, refill); + refcount_add(refill, ¤t->usage); + tctx->cached_refs += refill; +} + +static inline void io_get_task_refs(int nr) +{ + struct io_uring_task *tctx = current->io_uring; + + tctx->cached_refs -= nr; + if (unlikely(tctx->cached_refs < 0)) + io_task_refs_refill(tctx); +} + +static __cold void io_uring_drop_tctx_refs(struct task_struct *task) +{ + struct io_uring_task *tctx = task->io_uring; + unsigned int refs = tctx->cached_refs; + + if (refs) { + tctx->cached_refs = 0; + percpu_counter_sub(&tctx->inflight, refs); + put_task_struct_many(task, refs); + } +} + +static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, + s32 res, u32 cflags, u64 extra1, u64 extra2) +{ + struct io_overflow_cqe *ocqe; + size_t ocq_size = sizeof(struct io_overflow_cqe); + bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); + + if (is_cqe32) + ocq_size += sizeof(struct io_uring_cqe); + + ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT); + trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); + if (!ocqe) { + /* + * If we're in ring overflow flush mode, or in task cancel mode, + * or cannot allocate an overflow entry, then we need to drop it + * on the floor. + */ + io_account_cq_overflow(ctx); + set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); + return false; + } + if (list_empty(&ctx->cq_overflow_list)) { + set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); + atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); + + } + ocqe->cqe.user_data = user_data; + ocqe->cqe.res = res; + ocqe->cqe.flags = cflags; + if (is_cqe32) { + ocqe->cqe.big_cqe[0] = extra1; + ocqe->cqe.big_cqe[1] = extra2; + } + list_add_tail(&ocqe->list, &ctx->cq_overflow_list); + return true; +} + +bool io_req_cqe_overflow(struct io_kiocb *req) +{ + if (!(req->flags & REQ_F_CQE32_INIT)) { + req->extra1 = 0; + req->extra2 = 0; + } + return io_cqring_event_overflow(req->ctx, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + req->extra1, req->extra2); +} + +/* + * writes to the cq entry need to come after reading head; the + * control dependency is enough as we're using WRITE_ONCE to + * fill the cq entry + */ +struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) +{ + struct io_rings *rings = ctx->rings; + unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); + unsigned int free, queued, len; + + + /* userspace may cheat modifying the tail, be safe and do min */ + queued = min(__io_cqring_events(ctx), ctx->cq_entries); + free = ctx->cq_entries - queued; + /* we need a contiguous range, limit based on the current array offset */ + len = min(free, ctx->cq_entries - off); + if (!len) + return NULL; + + if (ctx->flags & IORING_SETUP_CQE32) { + off <<= 1; + len <<= 1; + } + + ctx->cqe_cached = &rings->cqes[off]; + ctx->cqe_sentinel = ctx->cqe_cached + len; + + ctx->cached_cq_tail++; + ctx->cqe_cached++; + if (ctx->flags & IORING_SETUP_CQE32) + ctx->cqe_cached++; + return &rings->cqes[off]; +} + +static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, + u64 user_data, s32 res, u32 cflags, + bool allow_overflow) +{ + struct io_uring_cqe *cqe; + + ctx->cq_extra++; + + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. + */ + cqe = io_get_cqe(ctx); + if (likely(cqe)) { + trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); + + WRITE_ONCE(cqe->user_data, user_data); + WRITE_ONCE(cqe->res, res); + WRITE_ONCE(cqe->flags, cflags); + + if (ctx->flags & IORING_SETUP_CQE32) { + WRITE_ONCE(cqe->big_cqe[0], 0); + WRITE_ONCE(cqe->big_cqe[1], 0); + } + return true; + } + + if (allow_overflow) + return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + + return false; +} + +bool io_post_aux_cqe(struct io_ring_ctx *ctx, + u64 user_data, s32 res, u32 cflags, + bool allow_overflow) +{ + bool filled; + + io_cq_lock(ctx); + filled = io_fill_cqe_aux(ctx, user_data, res, cflags, allow_overflow); + io_cq_unlock_post(ctx); + return filled; +} + +static void __io_req_complete_put(struct io_kiocb *req) +{ + /* + * If we're the last reference to this request, add to our locked + * free_list cache. + */ + if (req_ref_put_and_test(req)) { + struct io_ring_ctx *ctx = req->ctx; + + if (req->flags & IO_REQ_LINK_FLAGS) { + if (req->flags & IO_DISARM_MASK) + io_disarm_next(req); + if (req->link) { + io_req_task_queue(req->link); + req->link = NULL; + } + } + io_req_put_rsrc(req); + /* + * Selected buffer deallocation in io_clean_op() assumes that + * we don't hold ->completion_lock. Clean them here to avoid + * deadlocks. + */ + io_put_kbuf_comp(req); + io_dismantle_req(req); + io_put_task(req->task, 1); + wq_list_add_head(&req->comp_list, &ctx->locked_free_list); + ctx->locked_free_nr++; + } +} + +void __io_req_complete_post(struct io_kiocb *req) +{ + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe_req(req->ctx, req); + __io_req_complete_put(req); +} + +void io_req_complete_post(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + io_cq_lock(ctx); + __io_req_complete_post(req); + io_cq_unlock_post(ctx); +} + +inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags) +{ + io_req_complete_post(req); +} + +void io_req_complete_failed(struct io_kiocb *req, s32 res) +{ + req_set_fail(req); + io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); + io_req_complete_post(req); +} + +/* + * Don't initialise the fields below on every allocation, but do that in + * advance and keep them valid across allocations. + */ +static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) +{ + req->ctx = ctx; + req->link = NULL; + req->async_data = NULL; + /* not necessary, but safer to zero */ + req->cqe.res = 0; +} + +static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, + struct io_submit_state *state) +{ + spin_lock(&ctx->completion_lock); + wq_list_splice(&ctx->locked_free_list, &state->free_list); + ctx->locked_free_nr = 0; + spin_unlock(&ctx->completion_lock); +} + +static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) +{ + return !ctx->submit_state.free_list.next; +} + +/* + * A request might get retired back into the request caches even before opcode + * handlers and io_issue_sqe() are done with it, e.g. inline completion path. + * Because of that, io_alloc_req() should be called only under ->uring_lock + * and with extra caution to not get a request that is still worked on. + */ +static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; + void *reqs[IO_REQ_ALLOC_BATCH]; + int ret, i; + + /* + * If we have more than a batch's worth of requests in our IRQ side + * locked cache, grab the lock and move them over to our submission + * side cache. + */ + if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) { + io_flush_cached_locked_reqs(ctx, &ctx->submit_state); + if (!io_req_cache_empty(ctx)) + return true; + } + + ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); + + /* + * Bulk alloc is all-or-nothing. If we fail to get a batch, + * retry single alloc to be on the safe side. + */ + if (unlikely(ret <= 0)) { + reqs[0] = kmem_cache_alloc(req_cachep, gfp); + if (!reqs[0]) + return false; + ret = 1; + } + + percpu_ref_get_many(&ctx->refs, ret); + for (i = 0; i < ret; i++) { + struct io_kiocb *req = reqs[i]; + + io_preinit_req(req, ctx); + io_req_add_to_cache(req, ctx); + } + return true; +} + +static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) +{ + if (unlikely(io_req_cache_empty(ctx))) + return __io_alloc_req_refill(ctx); + return true; +} + +static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) +{ + struct io_wq_work_node *node; + + node = wq_stack_extract(&ctx->submit_state.free_list); + return container_of(node, struct io_kiocb, comp_list); +} + +static inline void io_dismantle_req(struct io_kiocb *req) +{ + unsigned int flags = req->flags; + + if (unlikely(flags & IO_REQ_CLEAN_FLAGS)) + io_clean_op(req); + if (!(flags & REQ_F_FIXED_FILE)) + io_put_file(req->file); +} + +__cold void io_free_req(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + io_req_put_rsrc(req); + io_dismantle_req(req); + io_put_task(req->task, 1); + + spin_lock(&ctx->completion_lock); + wq_list_add_head(&req->comp_list, &ctx->locked_free_list); + ctx->locked_free_nr++; + spin_unlock(&ctx->completion_lock); +} + +static void __io_req_find_next_prep(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + io_cq_lock(ctx); + io_disarm_next(req); + io_cq_unlock_post(ctx); +} + +static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) +{ + struct io_kiocb *nxt; + + /* + * If LINK is set, we have dependent requests in this chain. If we + * didn't fail this request, queue the first one up, moving any other + * dependencies to the next request. In case of failure, fail the rest + * of the chain. + */ + if (unlikely(req->flags & IO_DISARM_MASK)) + __io_req_find_next_prep(req); + nxt = req->link; + req->link = NULL; + return nxt; +} + +static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) +{ + if (!ctx) + return; + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + if (*locked) { + io_submit_flush_completions(ctx); + mutex_unlock(&ctx->uring_lock); + *locked = false; + } + percpu_ref_put(&ctx->refs); +} + +static unsigned int handle_tw_list(struct llist_node *node, + struct io_ring_ctx **ctx, bool *locked, + struct llist_node *last) +{ + unsigned int count = 0; + + while (node != last) { + struct llist_node *next = node->next; + struct io_kiocb *req = container_of(node, struct io_kiocb, + io_task_work.node); + + prefetch(container_of(next, struct io_kiocb, io_task_work.node)); + + if (req->ctx != *ctx) { + ctx_flush_and_put(*ctx, locked); + *ctx = req->ctx; + /* if not contended, grab and improve batching */ + *locked = mutex_trylock(&(*ctx)->uring_lock); + percpu_ref_get(&(*ctx)->refs); + } + req->io_task_work.func(req, locked); + node = next; + count++; + } + + return count; +} + +/** + * io_llist_xchg - swap all entries in a lock-less list + * @head: the head of lock-less list to delete all entries + * @new: new entry as the head of the list + * + * If list is empty, return NULL, otherwise, return the pointer to the first entry. + * The order of entries returned is from the newest to the oldest added one. + */ +static inline struct llist_node *io_llist_xchg(struct llist_head *head, + struct llist_node *new) +{ + return xchg(&head->first, new); +} + +/** + * io_llist_cmpxchg - possibly swap all entries in a lock-less list + * @head: the head of lock-less list to delete all entries + * @old: expected old value of the first entry of the list + * @new: new entry as the head of the list + * + * perform a cmpxchg on the first entry of the list. + */ + +static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head, + struct llist_node *old, + struct llist_node *new) +{ + return cmpxchg(&head->first, old, new); +} + +void tctx_task_work(struct callback_head *cb) +{ + bool uring_locked = false; + struct io_ring_ctx *ctx = NULL; + struct io_uring_task *tctx = container_of(cb, struct io_uring_task, + task_work); + struct llist_node fake = {}; + struct llist_node *node = io_llist_xchg(&tctx->task_list, &fake); + unsigned int loops = 1; + unsigned int count = handle_tw_list(node, &ctx, &uring_locked, NULL); + + node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); + while (node != &fake) { + loops++; + node = io_llist_xchg(&tctx->task_list, &fake); + count += handle_tw_list(node, &ctx, &uring_locked, &fake); + node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); + } + + ctx_flush_and_put(ctx, &uring_locked); + + /* relaxed read is enough as only the task itself sets ->in_idle */ + if (unlikely(atomic_read(&tctx->in_idle))) + io_uring_drop_tctx_refs(current); + + trace_io_uring_task_work_run(tctx, count, loops); +} + +void io_req_task_work_add(struct io_kiocb *req) +{ + struct io_uring_task *tctx = req->task->io_uring; + struct io_ring_ctx *ctx = req->ctx; + struct llist_node *node; + bool running; + + running = !llist_add(&req->io_task_work.node, &tctx->task_list); + + /* task_work already pending, we're done */ + if (running) + return; + + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + + if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) + return; + + node = llist_del_all(&tctx->task_list); + + while (node) { + req = container_of(node, struct io_kiocb, io_task_work.node); + node = node->next; + if (llist_add(&req->io_task_work.node, + &req->ctx->fallback_llist)) + schedule_delayed_work(&req->ctx->fallback_work, 1); + } +} + +static void io_req_tw_post(struct io_kiocb *req, bool *locked) +{ + io_req_complete_post(req); +} + +void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags) +{ + io_req_set_res(req, res, cflags); + req->io_task_work.func = io_req_tw_post; + io_req_task_work_add(req); +} + +static void io_req_task_cancel(struct io_kiocb *req, bool *locked) +{ + /* not needed for normal modes, but SQPOLL depends on it */ + io_tw_lock(req->ctx, locked); + io_req_complete_failed(req, req->cqe.res); +} + +void io_req_task_submit(struct io_kiocb *req, bool *locked) +{ + io_tw_lock(req->ctx, locked); + /* req->task == current here, checking PF_EXITING is safe */ + if (likely(!(req->task->flags & PF_EXITING))) + io_queue_sqe(req); + else + io_req_complete_failed(req, -EFAULT); +} + +void io_req_task_queue_fail(struct io_kiocb *req, int ret) |
