| author | Jens Axboe <axboe@kernel.dk> | 2022-09-30 07:47:42 -0600 |
|---|---|---|
| committer | Jens Axboe <axboe@kernel.dk> | 2022-09-30 07:47:42 -0600 |
| commit | 5853a7b5512c3017f64ca26494bd7361a12d6992 (patch) | |
| tree | 5777f2d130ad2d7ee127df3eaaafc23ec73721ae | |
| parent | 736feaa3a08124020afe6e51f50bae8598c99f55 (diff) | |
| parent | 108893ddcc4d3aa0a4a02aeb02d478e997001227 (diff) | |
Merge branch 'for-6.1/io_uring' into for-6.1/passthrough
* for-6.1/io_uring: (56 commits)
io_uring/net: fix notif cqe reordering
io_uring/net: don't update msg_name if not provided
io_uring: don't gate task_work run on TIF_NOTIFY_SIGNAL
io_uring/rw: defer fsnotify calls to task context
io_uring/net: fix fast_iov assignment in io_setup_async_msg()
io_uring/net: fix non-zc send with address
io_uring/net: don't skip notifs for failed requests
io_uring/rw: don't lose short results on io_setup_async_rw()
io_uring/rw: fix unexpected link breakage
io_uring/net: fix cleanup double free free_iov init
io_uring: fix CQE reordering
io_uring/net: fix UAF in io_sendrecv_fail()
selftest/net: adjust io_uring sendzc notif handling
io_uring: ensure local task_work marks task as running
io_uring/net: zerocopy sendmsg
io_uring/net: combine fail handlers
io_uring/net: rename io_sendzc()
io_uring/net: support non-zerocopy sendto
io_uring/net: refactor io_setup_async_addr
io_uring/net: don't lose partial send_zc on fail
...
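Of the changes in the list above, zero-copy sendmsg (IORING_OP_SENDMSG_ZC) is the main new uapi. A minimal userspace sketch, assuming liburing >= 2.3 (for io_uring_prep_sendmsg_zc) and an already-connected socket; this is illustrative, not the selftest from the series:

```c
/* Sketch only: zero-copy sendmsg via IORING_OP_SENDMSG_ZC.
 * Assumes liburing >= 2.3 and a connected socket in `sockfd`;
 * names and sizes here are illustrative.
 */
#include <liburing.h>
#include <sys/socket.h>

static int send_zc(struct io_uring *ring, int sockfd, void *buf, size_t len)
{
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };
	int seen, ret = 0;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_sendmsg_zc(sqe, sockfd, &msg, 0);
	io_uring_submit(ring);

	/*
	 * A zero-copy send posts two CQEs: the send result (flagged with
	 * IORING_CQE_F_MORE) and, later, a notification CQE (flagged with
	 * IORING_CQE_F_NOTIF) saying the kernel is done with `buf`. The
	 * buffer must stay untouched until the notification arrives; per
	 * "don't skip notifs for failed requests" above, the notification
	 * is posted even on failure.
	 */
	for (seen = 0; seen < 2; seen++) {
		if (io_uring_wait_cqe(ring, &cqe))
			return -1;
		if (!(cqe->flags & IORING_CQE_F_NOTIF))
			ret = cqe->res;	/* bytes sent (or -errno) */
		io_uring_cqe_seen(ring, cqe);
	}
	return ret;
}
```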
| -rw-r--r-- | block/blk-mq.c | 3 |
| -rw-r--r-- | drivers/nvme/host/core.c | 1 |
| -rw-r--r-- | drivers/nvme/host/ioctl.c | 77 |
| -rw-r--r-- | drivers/nvme/host/multipath.c | 1 |
| -rw-r--r-- | drivers/nvme/host/nvme.h | 4 |
| -rw-r--r-- | fs/eventfd.c | 10 |
| -rw-r--r-- | include/linux/blk-mq.h | 1 |
| -rw-r--r-- | include/linux/eventfd.h | 2 |
| -rw-r--r-- | include/linux/fs.h | 2 |
| -rw-r--r-- | include/linux/io_uring.h | 8 |
| -rw-r--r-- | include/linux/io_uring_types.h | 4 |
| -rw-r--r-- | include/linux/sched.h | 2 |
| -rw-r--r-- | include/trace/events/io_uring.h | 29 |
| -rw-r--r-- | include/uapi/linux/io_uring.h | 8 |
| -rw-r--r-- | io_uring/cancel.c | 2 |
| -rw-r--r-- | io_uring/fdinfo.c | 48 |
| -rw-r--r-- | io_uring/io_uring.c | 304 |
| -rw-r--r-- | io_uring/io_uring.h | 62 |
| -rw-r--r-- | io_uring/kbuf.h | 12 |
| -rw-r--r-- | io_uring/net.c | 308 |
| -rw-r--r-- | io_uring/net.h | 12 |
| -rw-r--r-- | io_uring/opdef.c | 44 |
| -rw-r--r-- | io_uring/opdef.h | 1 |
| -rw-r--r-- | io_uring/rsrc.c | 2 |
| -rw-r--r-- | io_uring/rw.c | 189 |
| -rw-r--r-- | io_uring/rw.h | 1 |
| -rw-r--r-- | io_uring/timeout.c | 13 |
| -rw-r--r-- | io_uring/timeout.h | 2 |
| -rw-r--r-- | io_uring/uring_cmd.c | 11 |
| -rw-r--r-- | tools/testing/selftests/net/io_uring_zerocopy_tx.c | 22 |

30 files changed, 859 insertions(+), 326 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b9f78d9726bc..83492d942348 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1241,7 +1241,7 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t ret)
 	complete(&wait->done);
 }
 
-static bool blk_rq_is_poll(struct request *rq)
+bool blk_rq_is_poll(struct request *rq)
 {
 	if (!rq->mq_hctx)
 		return false;
@@ -1251,6 +1251,7 @@ static bool blk_rq_is_poll(struct request *rq)
 		return false;
 	return true;
 }
+EXPORT_SYMBOL_GPL(blk_rq_is_poll);
 
 static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
 {
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index be1d95a037fe..0f05b61a30fe 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3976,6 +3976,7 @@ static const struct file_operations nvme_ns_chr_fops = {
 	.unlocked_ioctl	= nvme_ns_chr_ioctl,
 	.compat_ioctl	= compat_ptr_ioctl,
 	.uring_cmd	= nvme_ns_chr_uring_cmd,
+	.uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
 };
 
 static int nvme_add_ns_cdev(struct nvme_ns *ns)
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index a48a79ed5c4c..357791ff0623 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -398,11 +398,19 @@ static void nvme_uring_cmd_end_io(struct request *req, blk_status_t err)
 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
 	/* extract bio before reusing the same field for request */
 	struct bio *bio = pdu->bio;
+	void *cookie = READ_ONCE(ioucmd->cookie);
 
 	pdu->req = req;
 	req->bio = bio;
-	/* this takes care of moving rest of completion-work to task context */
-	io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb);
+
+	/*
+	 * For iopoll, complete it directly.
+	 * Otherwise, move the completion to task work.
+	 */
+	if (cookie != NULL && blk_rq_is_poll(req))
+		nvme_uring_task_cb(ioucmd);
+	else
+		io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb);
 }
 
 static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
@@ -452,7 +460,10 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 		rq_flags = REQ_NOWAIT;
 		blk_flags = BLK_MQ_REQ_NOWAIT;
 	}
+	if (issue_flags & IO_URING_F_IOPOLL)
+		rq_flags |= REQ_POLLED;
 
+retry:
 	req = nvme_alloc_user_request(q, &c, nvme_to_user_ptr(d.addr),
 			d.data_len, nvme_to_user_ptr(d.metadata),
 			d.metadata_len, 0, &meta, d.timeout_ms ?
@@ -463,6 +474,17 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	req->end_io = nvme_uring_cmd_end_io;
 	req->end_io_data = ioucmd;
+	if (issue_flags & IO_URING_F_IOPOLL && rq_flags & REQ_POLLED) {
+		if (unlikely(!req->bio)) {
+			/* we can't poll this, so alloc regular req instead */
+			blk_mq_free_request(req);
+			rq_flags &= ~REQ_POLLED;
+			goto retry;
+		} else {
+			WRITE_ONCE(ioucmd->cookie, req->bio);
+			req->bio->bi_opf |= REQ_POLLED;
+		}
+	}
 	/* to free bio on completion, as req->bio will be null at that time */
 	pdu->bio = req->bio;
 	pdu->meta = meta;
@@ -566,9 +588,6 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 
 static int nvme_uring_cmd_checks(unsigned int issue_flags)
 {
-	/* IOPOLL not supported yet */
-	if (issue_flags & IO_URING_F_IOPOLL)
-		return -EOPNOTSUPP;
 	/* NVMe passthrough requires big SQE/CQE support */
 	if ((issue_flags & (IO_URING_F_SQE128|IO_URING_F_CQE32)) !=
@@ -611,6 +630,25 @@ int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
 	return nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
 }
 
+int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
+		struct io_comp_batch *iob,
+		unsigned int poll_flags)
+{
+	struct bio *bio;
+	int ret = 0;
+	struct nvme_ns *ns;
+	struct request_queue *q;
+
+	rcu_read_lock();
+	bio = READ_ONCE(ioucmd->cookie);
+	ns = container_of(file_inode(ioucmd->file)->i_cdev,
+			struct nvme_ns, cdev);
+	q = ns->queue;
+	if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev)
+		ret = bio_poll(bio, iob, poll_flags);
+	rcu_read_unlock();
+	return ret;
+}
 #ifdef CONFIG_NVME_MULTIPATH
 static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
 		void __user *argp, struct nvme_ns_head *head, int srcu_idx)
@@ -692,6 +730,31 @@ int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
 	srcu_read_unlock(&head->srcu, srcu_idx);
 	return ret;
 }
+
+int nvme_ns_head_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
+		struct io_comp_batch *iob,
+		unsigned int poll_flags)
+{
+	struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
+	struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev);
+	int srcu_idx = srcu_read_lock(&head->srcu);
+	struct nvme_ns *ns = nvme_find_path(head);
+	struct bio *bio;
+	int ret = 0;
+	struct request_queue *q;
+
+	if (ns) {
+		rcu_read_lock();
+		bio = READ_ONCE(ioucmd->cookie);
+		q = ns->queue;
+		if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio
+				&& bio->bi_bdev)
+			ret = bio_poll(bio, iob, poll_flags);
+		rcu_read_unlock();
+	}
+	srcu_read_unlock(&head->srcu, srcu_idx);
+	return ret;
+}
 #endif /* CONFIG_NVME_MULTIPATH */
 
 int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
@@ -699,6 +762,10 @@ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
 	struct nvme_ctrl *ctrl = ioucmd->file->private_data;
 	int ret;
 
+	/* IOPOLL not supported yet */
+	if (issue_flags & IO_URING_F_IOPOLL)
+		return -EOPNOTSUPP;
+
 	ret = nvme_uring_cmd_checks(issue_flags);
 	if (ret)
 		return ret;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 6ef497c75a16..00f2f81e20fa 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -439,6 +439,7 @@ static const struct file_operations nvme_ns_head_chr_fops = {
 	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
 	.compat_ioctl	= compat_ptr_ioctl,
 	.uring_cmd	= nvme_ns_head_chr_uring_cmd,
+	.uring_cmd_iopoll = nvme_ns_head_chr_uring_cmd_iopoll,
 };
 
 static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
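The NVMe hunks above wire up the new ->uring_cmd_iopoll hook so passthrough commands can complete via polled I/O. A hypothetical userspace sketch of a polled passthrough read follows; it needs a kernel with this series and poll queues configured on the controller, and the device path, nsid, and 512-byte LBA size are all assumptions (error handling is mostly elided):

```c
/* Sketch only: polled NVMe passthrough read via IORING_OP_URING_CMD. */
#include <liburing.h>
#include <linux/nvme_ioctl.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	struct io_uring_params p = { };
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct nvme_uring_cmd *cmd;
	void *buf;
	int fd;

	/* Polling needs IOPOLL; NVMe passthrough needs big SQEs and CQEs. */
	p.flags = IORING_SETUP_IOPOLL | IORING_SETUP_SQE128 | IORING_SETUP_CQE32;
	if (io_uring_queue_init_params(8, &ring, &p))
		return 1;

	fd = open("/dev/ng0n1", O_RDONLY);	/* generic char node, hypothetical */
	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		return 1;

	sqe = io_uring_get_sqe(&ring);
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = fd;
	sqe->cmd_op = NVME_URING_CMD_IO;
	sqe->user_data = 1;

	/* The passthrough command lives in the extended half of the SQE. */
	cmd = (struct nvme_uring_cmd *)sqe->cmd;
	memset(cmd, 0, sizeof(*cmd));
	cmd->opcode = 0x02;			/* NVMe read */
	cmd->nsid = 1;				/* hypothetical namespace */
	cmd->addr = (unsigned long)buf;
	cmd->data_len = 4096;
	cmd->cdw10 = 0;				/* starting LBA (low) */
	cmd->cdw12 = 4096 / 512 - 1;		/* 0-based block count */

	io_uring_submit(&ring);
	/* With IOPOLL, waiting spins on the poll queue instead of sleeping. */
	if (!io_uring_wait_cqe(&ring, &cqe)) {
		printf("res=%d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}
```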
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 7902c5245363..a29877217ee6 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -843,6 +843,10 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
 		unsigned long arg);
 long nvme_dev_ioctl(struct file *file, unsigned int cmd,
 		unsigned long arg);
+int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
+		struct io_comp_batch *iob, unsigned int poll_flags);
+int nvme_ns_head_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
+		struct io_comp_batch *iob, unsigned int poll_flags);
 int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd,
 		unsigned int issue_flags);
 int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 3627dd7d25db..c0ffee99ad23 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -69,17 +69,17 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
 	 * it returns false, the eventfd_signal() call should be deferred to a
 	 * safe context.
 	 */
-	if (WARN_ON_ONCE(current->in_eventfd_signal))
+	if (WARN_ON_ONCE(current->in_eventfd))
 		return 0;
 
 	spin_lock_irqsave(&ctx->wqh.lock, flags);
-	current->in_eventfd_signal = 1;
+	current->in_eventfd = 1;
 	if (ULLONG_MAX - ctx->count < n)
 		n = ULLONG_MAX - ctx->count;
 	ctx->count += n;
 	if (waitqueue_active(&ctx->wqh))
 		wake_up_locked_poll(&ctx->wqh, EPOLLIN);
-	current->in_eventfd_signal = 0;
+	current->in_eventfd = 0;
 	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 
 	return n;
@@ -253,8 +253,10 @@ static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
 		__set_current_state(TASK_RUNNING);
 	}
 	eventfd_ctx_do_read(ctx, &ucnt);
+	current->in_eventfd = 1;
 	if (waitqueue_active(&ctx->wqh))
 		wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
+	current->in_eventfd = 0;
 	spin_unlock_irq(&ctx->wqh.lock);
 	if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt)))
 		return -EFAULT;
@@ -301,8 +303,10 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
 	}
 	if (likely(res > 0)) {
 		ctx->count += ucnt;
+		current->in_eventfd = 1;
 		if (waitqueue_active(&ctx->wqh))
 			wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+		current->in_eventfd = 0;
 	}
 	spin_unlock_irq(&ctx->wqh.lock);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index e21ff85751cf..00a15808c137 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -987,6 +987,7 @@ int blk_rq_map_kern(struct request_queue *, struct request *, void *,
 int blk_rq_append_bio(struct request *rq, struct bio *bio);
 void blk_execute_rq_nowait(struct request *rq, bool at_head);
 blk_status_t blk_execute_rq(struct request *rq, bool at_head);
+bool blk_rq_is_poll(struct request *rq);
 
 struct req_iterator {
 	struct bvec_iter iter;
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index 305d5f19093b..30eb30d6909b 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -46,7 +46,7 @@ void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);
 
 static inline bool eventfd_signal_allowed(void)
 {
-	return !current->in_eventfd_signal;
+	return !current->in_eventfd;
 }
 
 #else /* CONFIG_EVENTFD */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9eced4cc286e..01681d061a6a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2132,6 +2132,8 @@ struct file_operations {
 			loff_t len, unsigned int remap_flags);
 	int (*fadvise)(struct file *, loff_t, loff_t, int);
 	int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
+	int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *,
+				unsigned int poll_flags);
 } __randomize_layout;
 
 struct inode_operations {
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 4a2f6cc5a492..58676c0a398f 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -20,8 +20,12 @@ enum io_uring_cmd_flags {
 struct io_uring_cmd {
 	struct file	*file;
 	const void	*cmd;
-	/* callback to defer completions to task context */
-	void (*task_work_cb)(struct io_uring_cmd *cmd);
+	union {
+		/* callback to defer completions to task context */
+		void (*task_work_cb)(struct io_uring_cmd *cmd);
+		/* used for polled completion */
+		void *cookie;
+	};
 	u32		cmd_op;
 	u32		pad;
 	u8		pdu[32]; /* available inline for free use */
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 677a25d44d7f..aa4d90a53866 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -184,6 +184,8 @@ struct io_ev_fd {
 	struct eventfd_ctx	*cq_ev_fd;
 	unsigned int		eventfd_async: 1;
 	struct rcu_head		rcu;
+	atomic_t		refs;
+	atomic_t		ops;
 };
 
 struct io_alloc_cache {
@@ -301,6 +303,8 @@ struct io_ring_ctx {
 		struct io_hash_table	cancel_table;
 		bool			poll_multi_queue;
 
+		struct llist_head	work_llist;
+
 		struct list_head	io_buffers_comp;
 	} ____cacheline_aligned_in_smp;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e7b2f8a5c711..8d82d6d32670 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -936,7 +936,7 @@ struct task_struct {
 #endif
 #ifdef CONFIG_EVENTFD
 	/* Recursion prevention for eventfd_signal() */
-	unsigned			in_eventfd_signal:1;
+	unsigned			in_eventfd:1;
 #endif
 #ifdef CONFIG_IOMMU_SVA
 	unsigned			pasid_activated:1;
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index c5b21ff0ac85..936fd41bf147 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -655,6 +655,35 @@ TRACE_EVENT(io_uring_short_write,
 		  __entry->wanted, __entry->got)
 );
 
+/*
+ * io_uring_local_work_run - ran ring local task work
+ *
+ * @tctx:		pointer to a io_uring_ctx
+ * @count:		how many functions it ran
+ * @loops:		how many loops it ran
+ *
+ */
+TRACE_EVENT(io_uring_local_work_run,
+
+	TP_PROTO(void *ctx, int count, unsigned int loops),
+
+	TP_ARGS(ctx, count, loops),
+
+	TP_STRUCT__entry (
+		__field(void *,		ctx	)
+		__field(int,		count	)
+		__field(unsigned int,	loops	)
+	),
+
+	TP_fast_assign(
+		__entry->ctx		= ctx;
+		__entry->count		= count;
+		__entry->loops		= loops;
+	),
+
+	TP_printk("ring %p, count %d, loops %u", __entry->ctx, __entry->count, __entry->loops)
+);
+
 #endif /* _TRACE_IO_URING_H */
 
 /* This part must be outside protection */
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 6b83177fd41d..92f29d9505a6 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -157,6 +157,13 @@ enum {
  */
 #define IORING_SETUP_SINGLE_ISSUER	(1U << 12)
 
+/*
+ * Defer running task work to get events.
+ * Rather than running bits of task work whenever the task transitions
+ * try to do it just before it is needed.
+ */
+#define IORING_SETUP_DEFER_TASKRUN	(1U << 13)
+
 enum io_uring_op {
 	IORING_OP_NOP,
 	IORING_OP_READV,
@@ -206,6 +213,7 @@ enum io_uring_op {
 	IORING_OP_SOCKET,
 	IORING_OP_URING_CMD,
 	IORING_OP_SEND_ZC,
+	IORING_OP_SENDMSG_ZC,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 5fc5d3e80fcb..2291a53cdabd 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -292,7 +292,7 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
 			break;
 
 		mutex_unlock(&ctx->uring_lock);
-		ret = io_run_task_work_sig();
+		ret = io_run_task_work_sig(ctx);
 		if (ret < 0) {
 			mutex_lock(&ctx->uring_lock);
 			break;
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index b29e2d02216f..4eae088046d0 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -60,13 +60,15 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
 	unsigned int cq_head = READ_ONCE(r->cq.head);
 	unsigned int cq_tail = READ_ONCE(r->cq.tail);
 	unsigned int cq_shift = 0;
+	unsigned int sq_shift = 0;
 	unsigned int sq_entries, cq_entries;
 	bool has_lock;
-	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
 	unsigned int i;
 
-	if (is_cqe32)
+	if (ctx->flags & IORING_SETUP_CQE32)
 		cq_shift = 1;
+	if (ctx->flags & IORING_SETUP_SQE128)
+		sq_shift = 1;
 
 	/*
 	 * we may get imprecise sqe and cqe info if uring is actively running
@@ -82,19 +84,36 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
 	seq_printf(m, "CqHead:\t%u\n", cq_head);
 	seq_printf(m, "CqTail:\t%u\n", cq_tail);
 	seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
-	seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
+	seq_printf(m, "SQEs:\t%u\n", sq_tail - sq_head);
 	sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
 	for (i = 0; i < sq_entries; i++) {
 		unsigned int entry = i + sq_head;
-		unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
 		struct io_uring_sqe *sqe;
+		unsigned int sq_idx;
 
+		sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
 		if (sq_idx > sq_mask)
 			continue;
-		sqe = &ctx->sq_sqes[sq_idx];
-		seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
-			   sq_idx, sqe->opcode, sqe->fd, sqe->flags,
-			   sqe->user_data);
+		sqe = &ctx->sq_sqes[sq_idx << sq_shift];
+		seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, "
+			      "addr:0x%llx, rw_flags:0x%x, buf_index:%d "
+			      "user_data:%llu",
+			   sq_idx, io_uring_get_opcode(sqe->opcode), sqe->fd,
+			   sqe->flags, (unsigned long long) sqe->off,
+			   (unsigned long long) sqe->addr, sqe->rw_flags,
+			   sqe->buf_index, sqe->user_data);
+		if (sq_shift) {
+			u64 *sqeb = (void *) (sqe + 1);
+			int size = sizeof(struct io_uring_sqe) / sizeof(u64);
+			int j;
+
+			for (j = 0; j < size; j++) {
+				seq_printf(m, ", e%d:0x%llx", j,
+						(unsigned long long) *sqeb);
+				sqeb++;
+			}
+		}
+		seq_printf(m, "\n");
 	}
 	seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
 	cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
@@ -102,16 +121,13 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
 		unsigned int entry = i + cq_head;
 		struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift];
 
-		if (!is_cqe32) {
-			seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
+		seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x",
 			   entry & cq_mask, cqe->user_data, cqe->res,
 			   cqe->flags);
-		} else {
-			seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, "
-				"extra1:%llu, extra2:%llu\n",
-				entry & cq_mask, cqe->user_data, cqe->res,
-				cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]);
-		}
+		if (cq_shift)
+			seq_printf(m, ", extra1:%llu, extra2:%llu\n",
+					cqe->big_cqe[0], cqe->big_cqe[1]);
+		seq_printf(m, "\n");
 	}
 
 	/*
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 75ecc47e3cd5..6e44186ba695 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -125,6 +125,11 @@ enum {
 	IO_CHECK_CQ_DROPPED_BIT,
 };
 
+enum {
+	IO_EVENTFD_OP_SIGNAL_BIT,
+	IO_EVENTFD_OP_FREE_BIT,
+};
+
 struct io_defer_entry {
 	struct list_head	list;
 	struct io_kiocb		*req;
@@ -142,7 +147,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 static void io_dismantle_req(struct io_kiocb *req);
 static void io_clean_op(struct io_kiocb *req);
 static void io_queue_sqe(struct io_kiocb *req);
-
+static void io_move_task_work_from_local(struct io_ring_ctx *ctx);
 static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
 
 static struct kmem_cache *req_cachep;
@@ -316,6 +321,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
 	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
 	init_llist_head(&ctx->rsrc_put_llist);
+	init_llist_head(&ctx->work_llist);
 	INIT_LIST_HEAD(&ctx->tctx_list);
 	ctx->submit_state.free_list.next = NULL;
 	INIT_WQ_LIST(&ctx->locked_free_list);
@@ -477,25 +483,28 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
 	}
 }
 
-static void io_eventfd_signal(struct io_ring_ctx *ctx)
+
+static void io_eventfd_ops(struct rcu_head *rcu)
 {
-	struct io_ev_fd *ev_fd;
-	bool skip;
+	struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
+	int ops = atomic_xchg(&ev_fd->ops, 0);
 
-	spin_lock(&ctx->completion_lock);
-	/*
-	 * Eventfd should only get triggered when at least one event has been
-	 * posted. Some applications rely on the eventfd notification count only
-	 * changing IFF a new CQE has been added to the CQ ring. There's no
-	 * depedency on 1:1 relationship between how many times this function is
-	 * called (and hence the eventfd count) and number of CQEs posted to the
-	 * CQ ring.
+	if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT))
+		eventfd_signal(ev_fd->cq_ev_fd, 1);
+
+	/* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback
+	 * ordering in a race but if references are 0 we know we have to free
+	 * it regardless.
 	 */
-	skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
-	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
-	spin_unlock(&ctx->completion_lock);
-	if (skip)
-		return;
+	if (atomic_dec_and_test(&ev_fd->refs)) {
+		eventfd_ctx_put(ev_fd->cq_ev_fd);
+		kfree(ev_fd);
+	}
+}
+
+static void io_eventfd_signal(struct io_ring_ctx *ctx)
+{
+	struct io_ev_fd *ev_fd = NULL;
 
 	rcu_read_lock();
 	/*
@@ -513,13 +522,46 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx)
 		goto out;
 	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
 		goto out;
+	if (ev_fd->eventfd_async && !io_wq_current_is_worker())
+		goto out;
 
-	if (!ev_fd->eventfd_async || io_wq_current_is_worker())
+	if (likely(eventfd_signal_allowed())) {
 		eventfd_signal(ev_fd->cq_ev_fd, 1);
+	} else {
+		atomic_inc(&ev_fd->refs);
+		if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops))
+			call_rcu(&ev_fd->rcu, io_eventfd_ops);
+		else
+			atomic_dec(&ev_fd->refs);
+	}
+
 out:
 	rcu_read_unlock();
 }
 
+static void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
+{
+	bool skip;
+
+	spin_lock(&ctx->completion_lock);
+
+	/*
+	 * Eventfd should only get triggered when at least one event has been
+	 * posted. Some applications rely on the eventfd notification count
+	 * only changing IFF a new CQE has been added to the CQ ring. There's
+	 * no depedency on 1:1 relationship between how many times this
+	 * function is called (and hence the eventfd count) and number of CQEs
+	 * posted to the CQ ring.
+	 */
+	skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
+	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
+	spin_unlock(&ctx->completion_lock);
+	if (skip)
+		return;
+
+	io_eventfd_signal(ctx);
+}
+
 void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
 {
 	if (ctx->off_timeout_used || ctx->drain_active) {
@@ -531,7 +573,7 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
 		spin_unlock(&ctx->completion_lock);
 	}
 	if (ctx->has_evfd)
-		io_eventfd_signal(ctx);
+		io_eventfd_flush_signal(ctx);
 }
 
 static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx)
@@ -567,7 +609,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 
 	io_cq_lock(ctx);
 	while (!list_empty(&ctx->cq_overflow_list)) {
-		struct io_uring_cqe *cqe = io_get_cqe(ctx);
+		struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true);
 		struct io_overflow_cqe *ocqe;
 
 		if (!cqe && !force)
@@ -694,12 +736,19 @@ bool io_req_cqe_overflow(struct io_kiocb *req)
  * control dependency is enough as we're using WRITE_ONCE to
  * fill the cq entry
 */
-struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
+struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow)
 {
 	struct io_rings *rings = ctx->rings;
 	unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
 	unsigned int free, queued, len;
 
+	/*
+	 * Posting into the CQ when there are pending overflowed CQEs may break
+	 * ordering guarantees, which will affect links, F_MORE users and more.
+	 * Force overflow the completion.
+	 */
+	if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
+		return NULL;
+
 	/* userspace may cheat modifying the tail, be safe and do min */
 	queued = min(__io_cqring_events(ctx), ctx->cq_entries);
@@ -823,8 +872,12 @@ inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags)
 
 void io_req_complete_failed(struct io_kiocb *req, s32 res)
 {
+	const struct io_op_def *def = &io_op_defs[req->opcode];
+
 	req_set_fail(req);
 	io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
+	if (def->fail)
+		def->fail(req);
 	io_req_complete_post(req);
 }
@@ -1047,17 +1100,40 @@ void tctx_task_work(struct callback_head *cb)
 	trace_io_uring_task_work_run(tctx, count, loops);
 }
 
-void io_req_task_work_add(struct io_kiocb *req)
+static void io_req_local_work_add(struct io_kiocb *req)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+
+	if (!llist_add(&req->io_task_work.node, &ctx->work_llist))
+		return;
+
+	if (unlikely(atomic_read(&req->task->io_uring->in_idle))) {
+		io_move_task_work_from_local(ctx);
+		return;
+	}
+
+	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
+		atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
+
+	if (ctx->has_evfd)
+		io_eventfd_signal(ctx);
+	io_cqring_wake(ctx);
+
+}
+
+static inline void __io_req_task_work_add(struct io_kiocb *req, bool allow_local)
 {
 	struct io_uring_task *tctx = req->task->io_uring;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct llist_node *node;
-	bool running;
 
-	running = !llist_add(&req->io_task_work.node, &tctx->task_list);
+	if (allow_local && ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
+		io_req_local_work_add(req);
+		return;
+	}
 
 	/* task_work already pending, we're done */
-	if (running)
+	if (!llist_add(&req->io_task_work.node, &tctx->task_list))
 		return;
 
 	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
@@ -1077,6 +1153,84 @@ void io_req_task_work_add(struct io_kiocb *req)
 	}
 }
 
+void io_req_task_work_add(struct io_kiocb *req)
+{
+	__io_req_task_work_add(req, true);
+}
+
+static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
+{
+	struct llist_node *node;
+
+	node = llist_del_all(&ctx->work_llist);
+	while (node) {
+		struct io_kiocb *req = container_of(node, struct io_kiocb,
+						    io_task_work.node);
+
+		node = node->next;
+		__io_req_task_work_add(req, false);
+	}
+}
+
+int __io_run_local_work(struct io_ring_ctx *ctx, bool locked)
+{
+	struct llist_node *node;
+	struct llist_node fake;
+	struct llist_node *current_final = NULL;
+	int ret;
+	unsigned int loops = 1;
+
+	if (unlikely(ctx->submitter_task != current))
+		return -EEXIST;
+
+	node = io_llist_xchg(&ctx->work_llist, &fake);
+	ret = 0;
+again:
+	while (node != current_final) {
+		struct llist_node *next = node->next;
+		struct io_kiocb *req = container_of(node, struct io_kiocb,
+						    io_task_work.node);
+		prefetch(container_of(next, struct io_kiocb, io_task_work.node));
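__io_run_local_work() above is the core of IORING_SETUP_DEFER_TASKRUN: completion work queues on the ring's work_llist and runs only when the one submitting task comes in to get events, rather than via inter-task signaling. A minimal sketch of opting in from userspace, assuming liburing built against 6.1-era uapi headers; the kernel refuses DEFER_TASKRUN without IORING_SETUP_SINGLE_ISSUER:

```c
/* Sketch only: create a ring with deferred task work. */
#include <liburing.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int ret;

	ret = io_uring_queue_init(8, &ring, IORING_SETUP_SINGLE_ISSUER |
					    IORING_SETUP_DEFER_TASKRUN);
	if (ret)
		return 1;	/* e.g. -EINVAL on kernels without the flag */

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);
	io_uring_submit(&ring);

	/* Local task work for this ring runs here, inside the wait,
	 * in the context of the single task allowed to issue on it. */
	if (!io_uring_wait_cqe(&ring, &cqe))
		io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}
```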
