author		Jens Axboe <axboe@kernel.dk>	2022-09-30 07:47:42 -0600
committer	Jens Axboe <axboe@kernel.dk>	2022-09-30 07:47:42 -0600
commit		5853a7b5512c3017f64ca26494bd7361a12d6992 (patch)
tree		5777f2d130ad2d7ee127df3eaaafc23ec73721ae
parent		736feaa3a08124020afe6e51f50bae8598c99f55 (diff)
parent		108893ddcc4d3aa0a4a02aeb02d478e997001227 (diff)
Merge branch 'for-6.1/io_uring' into for-6.1/passthrough
* for-6.1/io_uring: (56 commits)
  io_uring/net: fix notif cqe reordering
  io_uring/net: don't update msg_name if not provided
  io_uring: don't gate task_work run on TIF_NOTIFY_SIGNAL
  io_uring/rw: defer fsnotify calls to task context
  io_uring/net: fix fast_iov assignment in io_setup_async_msg()
  io_uring/net: fix non-zc send with address
  io_uring/net: don't skip notifs for failed requests
  io_uring/rw: don't lose short results on io_setup_async_rw()
  io_uring/rw: fix unexpected link breakage
  io_uring/net: fix cleanup double free free_iov init
  io_uring: fix CQE reordering
  io_uring/net: fix UAF in io_sendrecv_fail()
  selftest/net: adjust io_uring sendzc notif handling
  io_uring: ensure local task_work marks task as running
  io_uring/net: zerocopy sendmsg
  io_uring/net: combine fail handlers
  io_uring/net: rename io_sendzc()
  io_uring/net: support non-zerocopy sendto
  io_uring/net: refactor io_setup_async_addr
  io_uring/net: don't lose partial send_zc on fail
  ...
-rw-r--r--block/blk-mq.c3
-rw-r--r--drivers/nvme/host/core.c1
-rw-r--r--drivers/nvme/host/ioctl.c77
-rw-r--r--drivers/nvme/host/multipath.c1
-rw-r--r--drivers/nvme/host/nvme.h4
-rw-r--r--fs/eventfd.c10
-rw-r--r--include/linux/blk-mq.h1
-rw-r--r--include/linux/eventfd.h2
-rw-r--r--include/linux/fs.h2
-rw-r--r--include/linux/io_uring.h8
-rw-r--r--include/linux/io_uring_types.h4
-rw-r--r--include/linux/sched.h2
-rw-r--r--include/trace/events/io_uring.h29
-rw-r--r--include/uapi/linux/io_uring.h8
-rw-r--r--io_uring/cancel.c2
-rw-r--r--io_uring/fdinfo.c48
-rw-r--r--io_uring/io_uring.c304
-rw-r--r--io_uring/io_uring.h62
-rw-r--r--io_uring/kbuf.h12
-rw-r--r--io_uring/net.c308
-rw-r--r--io_uring/net.h12
-rw-r--r--io_uring/opdef.c44
-rw-r--r--io_uring/opdef.h1
-rw-r--r--io_uring/rsrc.c2
-rw-r--r--io_uring/rw.c189
-rw-r--r--io_uring/rw.h1
-rw-r--r--io_uring/timeout.c13
-rw-r--r--io_uring/timeout.h2
-rw-r--r--io_uring/uring_cmd.c11
-rw-r--r--tools/testing/selftests/net/io_uring_zerocopy_tx.c22
30 files changed, 859 insertions, 326 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b9f78d9726bc..83492d942348 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1241,7 +1241,7 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t ret)
complete(&wait->done);
}
-static bool blk_rq_is_poll(struct request *rq)
+bool blk_rq_is_poll(struct request *rq)
{
if (!rq->mq_hctx)
return false;
@@ -1251,6 +1251,7 @@ static bool blk_rq_is_poll(struct request *rq)
return false;
return true;
}
+EXPORT_SYMBOL_GPL(blk_rq_is_poll);
static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
{
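Exporting blk_rq_is_poll() lets a driver's completion handler tell, at completion time, whether the request was issued on a polled queue and can therefore be finished inline rather than bounced to task context. A minimal sketch of that pattern follows; the "mydrv" names are hypothetical, and the NVMe hunks below are the real user in this series:

	/*
	 * Sketch only: polled requests complete from the submitter's polling
	 * loop, so it is safe to finish them inline; everything else is
	 * punted to task work, as nvme_uring_cmd_end_io() does below.
	 */
	static void mydrv_uring_cmd_end_io(struct request *req, blk_status_t err)
	{
		struct io_uring_cmd *ioucmd = req->end_io_data;

		if (blk_rq_is_poll(req))
			mydrv_uring_task_cb(ioucmd);	/* hypothetical callback */
		else
			io_uring_cmd_complete_in_task(ioucmd, mydrv_uring_task_cb);
	}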
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index be1d95a037fe..0f05b61a30fe 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3976,6 +3976,7 @@ static const struct file_operations nvme_ns_chr_fops = {
.unlocked_ioctl = nvme_ns_chr_ioctl,
.compat_ioctl = compat_ptr_ioctl,
.uring_cmd = nvme_ns_chr_uring_cmd,
+ .uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
};
static int nvme_add_ns_cdev(struct nvme_ns *ns)
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index a48a79ed5c4c..357791ff0623 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -398,11 +398,19 @@ static void nvme_uring_cmd_end_io(struct request *req, blk_status_t err)
struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
/* extract bio before reusing the same field for request */
struct bio *bio = pdu->bio;
+ void *cookie = READ_ONCE(ioucmd->cookie);
pdu->req = req;
req->bio = bio;
- /* this takes care of moving rest of completion-work to task context */
- io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb);
+
+ /*
+ * For iopoll, complete it directly.
+ * Otherwise, move the completion to task work.
+ */
+ if (cookie != NULL && blk_rq_is_poll(req))
+ nvme_uring_task_cb(ioucmd);
+ else
+ io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb);
}
static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
@@ -452,7 +460,10 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
rq_flags = REQ_NOWAIT;
blk_flags = BLK_MQ_REQ_NOWAIT;
}
+ if (issue_flags & IO_URING_F_IOPOLL)
+ rq_flags |= REQ_POLLED;
+retry:
req = nvme_alloc_user_request(q, &c, nvme_to_user_ptr(d.addr),
d.data_len, nvme_to_user_ptr(d.metadata),
d.metadata_len, 0, &meta, d.timeout_ms ?
@@ -463,6 +474,17 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
req->end_io = nvme_uring_cmd_end_io;
req->end_io_data = ioucmd;
+ if (issue_flags & IO_URING_F_IOPOLL && rq_flags & REQ_POLLED) {
+ if (unlikely(!req->bio)) {
+ /* we can't poll this, so alloc regular req instead */
+ blk_mq_free_request(req);
+ rq_flags &= ~REQ_POLLED;
+ goto retry;
+ } else {
+ WRITE_ONCE(ioucmd->cookie, req->bio);
+ req->bio->bi_opf |= REQ_POLLED;
+ }
+ }
/* to free bio on completion, as req->bio will be null at that time */
pdu->bio = req->bio;
pdu->meta = meta;
@@ -566,9 +588,6 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
static int nvme_uring_cmd_checks(unsigned int issue_flags)
{
- /* IOPOLL not supported yet */
- if (issue_flags & IO_URING_F_IOPOLL)
- return -EOPNOTSUPP;
/* NVMe passthrough requires big SQE/CQE support */
if ((issue_flags & (IO_URING_F_SQE128|IO_URING_F_CQE32)) !=
@@ -611,6 +630,25 @@ int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
return nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
}
+int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
+ struct io_comp_batch *iob,
+ unsigned int poll_flags)
+{
+ struct bio *bio;
+ int ret = 0;
+ struct nvme_ns *ns;
+ struct request_queue *q;
+
+ rcu_read_lock();
+ bio = READ_ONCE(ioucmd->cookie);
+ ns = container_of(file_inode(ioucmd->file)->i_cdev,
+ struct nvme_ns, cdev);
+ q = ns->queue;
+ if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev)
+ ret = bio_poll(bio, iob, poll_flags);
+ rcu_read_unlock();
+ return ret;
+}
#ifdef CONFIG_NVME_MULTIPATH
static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
void __user *argp, struct nvme_ns_head *head, int srcu_idx)
@@ -692,6 +730,31 @@ int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
srcu_read_unlock(&head->srcu, srcu_idx);
return ret;
}
+
+int nvme_ns_head_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
+ struct io_comp_batch *iob,
+ unsigned int poll_flags)
+{
+ struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
+ struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev);
+ int srcu_idx = srcu_read_lock(&head->srcu);
+ struct nvme_ns *ns = nvme_find_path(head);
+ struct bio *bio;
+ int ret = 0;
+ struct request_queue *q;
+
+ if (ns) {
+ rcu_read_lock();
+ bio = READ_ONCE(ioucmd->cookie);
+ q = ns->queue;
+ if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio
+ && bio->bi_bdev)
+ ret = bio_poll(bio, iob, poll_flags);
+ rcu_read_unlock();
+ }
+ srcu_read_unlock(&head->srcu, srcu_idx);
+ return ret;
+}
#endif /* CONFIG_NVME_MULTIPATH */
int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
@@ -699,6 +762,10 @@ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
struct nvme_ctrl *ctrl = ioucmd->file->private_data;
int ret;
+ /* IOPOLL not supported yet */
+ if (issue_flags & IO_URING_F_IOPOLL)
+ return -EOPNOTSUPP;
+
ret = nvme_uring_cmd_checks(issue_flags);
if (ret)
return ret;
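With the IOPOLL gate removed for per-namespace passthrough, userspace can now drive polled NVMe uring commands. Below is a minimal sketch under stated assumptions: liburing >= 2.2 with the 5.19+ uapi headers, an illustrative device path, namespace and 512-byte LBA format, and no error handling:

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdlib.h>
	#include <string.h>
	#include <liburing.h>
	#include <linux/nvme_ioctl.h>

	int main(void)
	{
		struct io_uring ring;
		struct io_uring_sqe *sqe;
		struct io_uring_cqe *cqe;
		void *buf;
		int fd, res;

		/* NVMe passthrough needs 128B SQEs / 32B CQEs; IOPOLL enables polling */
		io_uring_queue_init(8, &ring, IORING_SETUP_IOPOLL |
					      IORING_SETUP_SQE128 |
					      IORING_SETUP_CQE32);

		fd = open("/dev/ng0n1", O_RDONLY);	/* generic char node, illustrative */
		posix_memalign(&buf, 4096, 4096);

		sqe = io_uring_get_sqe(&ring);
		memset(sqe, 0, 2 * sizeof(*sqe));	/* clear the whole 128-byte SQE */
		sqe->opcode = IORING_OP_URING_CMD;
		sqe->fd = fd;
		sqe->cmd_op = NVME_URING_CMD_IO;

		/* the NVMe command itself lives in the big-SQE tail */
		struct nvme_uring_cmd *cmd = (struct nvme_uring_cmd *)sqe->cmd;
		cmd->opcode = 0x02;			/* nvme_cmd_read */
		cmd->nsid = 1;
		cmd->addr = (uintptr_t)buf;
		cmd->data_len = 4096;
		cmd->cdw10 = 0;				/* starting LBA (low 32 bits) */
		cmd->cdw12 = 8 - 1;			/* 0-based block count, 512B LBAs assumed */

		io_uring_submit(&ring);
		io_uring_wait_cqe(&ring, &cqe);		/* with IOPOLL this reaps by polling */
		res = cqe->res;
		io_uring_cqe_seen(&ring, cqe);
		io_uring_queue_exit(&ring);
		return res < 0 ? 1 : 0;
	}

cqe->res carries the status (negative errno on failure); the 32-byte CQE's extra fields carry the NVMe result.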
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 6ef497c75a16..00f2f81e20fa 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -439,6 +439,7 @@ static const struct file_operations nvme_ns_head_chr_fops = {
.unlocked_ioctl = nvme_ns_head_chr_ioctl,
.compat_ioctl = compat_ptr_ioctl,
.uring_cmd = nvme_ns_head_chr_uring_cmd,
+ .uring_cmd_iopoll = nvme_ns_head_chr_uring_cmd_iopoll,
};
static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 7902c5245363..a29877217ee6 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -843,6 +843,10 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
unsigned long arg);
long nvme_dev_ioctl(struct file *file, unsigned int cmd,
unsigned long arg);
+int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
+ struct io_comp_batch *iob, unsigned int poll_flags);
+int nvme_ns_head_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
+ struct io_comp_batch *iob, unsigned int poll_flags);
int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd,
unsigned int issue_flags);
int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 3627dd7d25db..c0ffee99ad23 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -69,17 +69,17 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
* it returns false, the eventfd_signal() call should be deferred to a
* safe context.
*/
- if (WARN_ON_ONCE(current->in_eventfd_signal))
+ if (WARN_ON_ONCE(current->in_eventfd))
return 0;
spin_lock_irqsave(&ctx->wqh.lock, flags);
- current->in_eventfd_signal = 1;
+ current->in_eventfd = 1;
if (ULLONG_MAX - ctx->count < n)
n = ULLONG_MAX - ctx->count;
ctx->count += n;
if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
- current->in_eventfd_signal = 0;
+ current->in_eventfd = 0;
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
return n;
@@ -253,8 +253,10 @@ static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
__set_current_state(TASK_RUNNING);
}
eventfd_ctx_do_read(ctx, &ucnt);
+ current->in_eventfd = 1;
if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
+ current->in_eventfd = 0;
spin_unlock_irq(&ctx->wqh.lock);
if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt)))
return -EFAULT;
@@ -301,8 +303,10 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
}
if (likely(res > 0)) {
ctx->count += ucnt;
+ current->in_eventfd = 1;
if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+ current->in_eventfd = 0;
}
spin_unlock_irq(&ctx->wqh.lock);
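The flag is renamed from in_eventfd_signal to in_eventfd because it is now also set around the wakeups performed from eventfd_read() and eventfd_write(), so eventfd_signal_allowed() reports "not safe" whenever the current task is anywhere inside eventfd wakeup processing. A caller that may itself run from such a wakeup checks the helper and defers; a sketch of that pattern, with a hypothetical deferral helper (io_uring below defers via call_rcu):

	/*
	 * Sketch only: signal the eventfd directly when it is safe, otherwise
	 * hand the signal off to another context.  my_deferred_signal() is a
	 * hypothetical helper (e.g. a workqueue or RCU callback).
	 */
	static void my_signal_completion(struct eventfd_ctx *evfd)
	{
		if (eventfd_signal_allowed())
			eventfd_signal(evfd, 1);
		else
			my_deferred_signal(evfd);
	}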
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index e21ff85751cf..00a15808c137 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -987,6 +987,7 @@ int blk_rq_map_kern(struct request_queue *, struct request *, void *,
int blk_rq_append_bio(struct request *rq, struct bio *bio);
void blk_execute_rq_nowait(struct request *rq, bool at_head);
blk_status_t blk_execute_rq(struct request *rq, bool at_head);
+bool blk_rq_is_poll(struct request *rq);
struct req_iterator {
struct bvec_iter iter;
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index 305d5f19093b..30eb30d6909b 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -46,7 +46,7 @@ void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);
static inline bool eventfd_signal_allowed(void)
{
- return !current->in_eventfd_signal;
+ return !current->in_eventfd;
}
#else /* CONFIG_EVENTFD */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9eced4cc286e..01681d061a6a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2132,6 +2132,8 @@ struct file_operations {
loff_t len, unsigned int remap_flags);
int (*fadvise)(struct file *, loff_t, loff_t, int);
int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
+ int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *,
+ unsigned int poll_flags);
} __randomize_layout;
struct inode_operations {
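The new ->uring_cmd_iopoll hook is what io_uring's IOPOLL loop invokes to make progress on an outstanding IORING_OP_URING_CMD. A provider registers it next to ->uring_cmd; a sketch with hypothetical "mydev" names, mirroring the nvme_ns_chr_fops and nvme_ns_head_chr_fops wiring in this series:

	/* Sketch only: names are illustrative, not part of this series. */
	static const struct file_operations mydev_fops = {
		.owner			= THIS_MODULE,
		.open			= mydev_open,
		.release		= mydev_release,
		.uring_cmd		= mydev_uring_cmd,
		.uring_cmd_iopoll	= mydev_uring_cmd_iopoll,
	};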
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 4a2f6cc5a492..58676c0a398f 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -20,8 +20,12 @@ enum io_uring_cmd_flags {
struct io_uring_cmd {
struct file *file;
const void *cmd;
- /* callback to defer completions to task context */
- void (*task_work_cb)(struct io_uring_cmd *cmd);
+ union {
+ /* callback to defer completions to task context */
+ void (*task_work_cb)(struct io_uring_cmd *cmd);
+ /* used for polled completion */
+ void *cookie;
+ };
u32 cmd_op;
u32 pad;
u8 pdu[32]; /* available inline for free use */
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 677a25d44d7f..aa4d90a53866 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -184,6 +184,8 @@ struct io_ev_fd {
struct eventfd_ctx *cq_ev_fd;
unsigned int eventfd_async: 1;
struct rcu_head rcu;
+ atomic_t refs;
+ atomic_t ops;
};
struct io_alloc_cache {
@@ -301,6 +303,8 @@ struct io_ring_ctx {
struct io_hash_table cancel_table;
bool poll_multi_queue;
+ struct llist_head work_llist;
+
struct list_head io_buffers_comp;
} ____cacheline_aligned_in_smp;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e7b2f8a5c711..8d82d6d32670 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -936,7 +936,7 @@ struct task_struct {
#endif
#ifdef CONFIG_EVENTFD
/* Recursion prevention for eventfd_signal() */
- unsigned in_eventfd_signal:1;
+ unsigned in_eventfd:1;
#endif
#ifdef CONFIG_IOMMU_SVA
unsigned pasid_activated:1;
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index c5b21ff0ac85..936fd41bf147 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -655,6 +655,35 @@ TRACE_EVENT(io_uring_short_write,
__entry->wanted, __entry->got)
);
+/*
+ * io_uring_local_work_run - ran ring local task work
+ *
+ * @tctx: pointer to a io_uring_ctx
+ * @count: how many functions it ran
+ * @loops: how many loops it ran
+ *
+ */
+TRACE_EVENT(io_uring_local_work_run,
+
+ TP_PROTO(void *ctx, int count, unsigned int loops),
+
+ TP_ARGS(ctx, count, loops),
+
+ TP_STRUCT__entry (
+ __field(void *, ctx )
+ __field(int, count )
+ __field(unsigned int, loops )
+ ),
+
+ TP_fast_assign(
+ __entry->ctx = ctx;
+ __entry->count = count;
+ __entry->loops = loops;
+ ),
+
+ TP_printk("ring %p, count %d, loops %u", __entry->ctx, __entry->count, __entry->loops)
+);
+
#endif /* _TRACE_IO_URING_H */
/* This part must be outside protection */
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 6b83177fd41d..92f29d9505a6 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -157,6 +157,13 @@ enum {
*/
#define IORING_SETUP_SINGLE_ISSUER (1U << 12)
+/*
+ * Defer running task work to get events.
+ * Rather than running bits of task work whenever the task transitions
+ * try to do it just before it is needed.
+ */
+#define IORING_SETUP_DEFER_TASKRUN (1U << 13)
+
enum io_uring_op {
IORING_OP_NOP,
IORING_OP_READV,
@@ -206,6 +213,7 @@ enum io_uring_op {
IORING_OP_SOCKET,
IORING_OP_URING_CMD,
IORING_OP_SEND_ZC,
+ IORING_OP_SENDMSG_ZC,
/* this goes last, obviously */
IORING_OP_LAST,
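Two userspace-visible additions land here: IORING_SETUP_DEFER_TASKRUN, which holds completion-side task work until the submitting task waits for events, and the IORING_OP_SENDMSG_ZC opcode. A minimal usage sketch, assuming a liburing whose headers carry these definitions (queue depth is illustrative); in this series DEFER_TASKRUN is only accepted together with IORING_SETUP_SINGLE_ISSUER:

	#include <sys/socket.h>
	#include <liburing.h>

	/* Ring whose task_work is deferred until this task waits for CQEs,
	 * e.g. in io_uring_submit_and_wait() / io_uring_wait_cqe(). */
	static int make_deferred_ring(struct io_uring *ring)
	{
		struct io_uring_params p = { };

		p.flags = IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN;
		return io_uring_queue_init_params(64, ring, &p);
	}

	/* Zerocopy sendmsg: prep a normal sendmsg SQE and switch the opcode.
	 * Expect two CQEs per request: the send result (flagged with
	 * IORING_CQE_F_MORE) and a later notification carrying
	 * IORING_CQE_F_NOTIF once the kernel no longer needs the buffers. */
	static void queue_sendmsg_zc(struct io_uring *ring, int sockfd,
				     struct msghdr *msg)
	{
		struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

		io_uring_prep_sendmsg(sqe, sockfd, msg, 0);
		sqe->opcode = IORING_OP_SENDMSG_ZC;
	}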
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 5fc5d3e80fcb..2291a53cdabd 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -292,7 +292,7 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
break;
mutex_unlock(&ctx->uring_lock);
- ret = io_run_task_work_sig();
+ ret = io_run_task_work_sig(ctx);
if (ret < 0) {
mutex_lock(&ctx->uring_lock);
break;
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index b29e2d02216f..4eae088046d0 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -60,13 +60,15 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
unsigned int cq_head = READ_ONCE(r->cq.head);
unsigned int cq_tail = READ_ONCE(r->cq.tail);
unsigned int cq_shift = 0;
+ unsigned int sq_shift = 0;
unsigned int sq_entries, cq_entries;
bool has_lock;
- bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
unsigned int i;
- if (is_cqe32)
+ if (ctx->flags & IORING_SETUP_CQE32)
cq_shift = 1;
+ if (ctx->flags & IORING_SETUP_SQE128)
+ sq_shift = 1;
/*
* we may get imprecise sqe and cqe info if uring is actively running
@@ -82,19 +84,36 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
seq_printf(m, "CqHead:\t%u\n", cq_head);
seq_printf(m, "CqTail:\t%u\n", cq_tail);
seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
- seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
+ seq_printf(m, "SQEs:\t%u\n", sq_tail - sq_head);
sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
for (i = 0; i < sq_entries; i++) {
unsigned int entry = i + sq_head;
- unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
struct io_uring_sqe *sqe;
+ unsigned int sq_idx;
+ sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
if (sq_idx > sq_mask)
continue;
- sqe = &ctx->sq_sqes[sq_idx];
- seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
- sq_idx, sqe->opcode, sqe->fd, sqe->flags,
- sqe->user_data);
+ sqe = &ctx->sq_sqes[sq_idx << 1];
+ seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, "
+ "addr:0x%llx, rw_flags:0x%x, buf_index:%d "
+ "user_data:%llu",
+ sq_idx, io_uring_get_opcode(sqe->opcode), sqe->fd,
+ sqe->flags, (unsigned long long) sqe->off,
+ (unsigned long long) sqe->addr, sqe->rw_flags,
+ sqe->buf_index, sqe->user_data);
+ if (sq_shift) {
+ u64 *sqeb = (void *) (sqe + 1);
+ int size = sizeof(struct io_uring_sqe) / sizeof(u64);
+ int j;
+
+ for (j = 0; j < size; j++) {
+ seq_printf(m, ", e%d:0x%llx", j,
+ (unsigned long long) *sqeb);
+ sqeb++;
+ }
+ }
+ seq_printf(m, "\n");
}
seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
@@ -102,16 +121,13 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
unsigned int entry = i + cq_head;
struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift];
- if (!is_cqe32) {
- seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
+ seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x",
entry & cq_mask, cqe->user_data, cqe->res,
cqe->flags);
- } else {
- seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, "
- "extra1:%llu, extra2:%llu\n",
- entry & cq_mask, cqe->user_data, cqe->res,
- cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]);
- }
+ if (cq_shift)
+ seq_printf(m, ", extra1:%llu, extra2:%llu\n",
+ cqe->big_cqe[0], cqe->big_cqe[1]);
+ seq_printf(m, "\n");
}
/*
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 75ecc47e3cd5..6e44186ba695 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -125,6 +125,11 @@ enum {
IO_CHECK_CQ_DROPPED_BIT,
};
+enum {
+ IO_EVENTFD_OP_SIGNAL_BIT,
+ IO_EVENTFD_OP_FREE_BIT,
+};
+
struct io_defer_entry {
struct list_head list;
struct io_kiocb *req;
@@ -142,7 +147,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
static void io_dismantle_req(struct io_kiocb *req);
static void io_clean_op(struct io_kiocb *req);
static void io_queue_sqe(struct io_kiocb *req);
-
+static void io_move_task_work_from_local(struct io_ring_ctx *ctx);
static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
static struct kmem_cache *req_cachep;
@@ -316,6 +321,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->rsrc_ref_list);
INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
init_llist_head(&ctx->rsrc_put_llist);
+ init_llist_head(&ctx->work_llist);
INIT_LIST_HEAD(&ctx->tctx_list);
ctx->submit_state.free_list.next = NULL;
INIT_WQ_LIST(&ctx->locked_free_list);
@@ -477,25 +483,28 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
}
}
-static void io_eventfd_signal(struct io_ring_ctx *ctx)
+
+static void io_eventfd_ops(struct rcu_head *rcu)
{
- struct io_ev_fd *ev_fd;
- bool skip;
+ struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
+ int ops = atomic_xchg(&ev_fd->ops, 0);
- spin_lock(&ctx->completion_lock);
- /*
- * Eventfd should only get triggered when at least one event has been
- * posted. Some applications rely on the eventfd notification count only
- * changing IFF a new CQE has been added to the CQ ring. There's no
- * depedency on 1:1 relationship between how many times this function is
- * called (and hence the eventfd count) and number of CQEs posted to the
- * CQ ring.
+ if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT))
+ eventfd_signal(ev_fd->cq_ev_fd, 1);
+
+ /* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback
+ * ordering in a race but if references are 0 we know we have to free
+ * it regardless.
*/
- skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
- ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
- spin_unlock(&ctx->completion_lock);
- if (skip)
- return;
+ if (atomic_dec_and_test(&ev_fd->refs)) {
+ eventfd_ctx_put(ev_fd->cq_ev_fd);
+ kfree(ev_fd);
+ }
+}
+
+static void io_eventfd_signal(struct io_ring_ctx *ctx)
+{
+ struct io_ev_fd *ev_fd = NULL;
rcu_read_lock();
/*
@@ -513,13 +522,46 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx)
goto out;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
goto out;
+ if (ev_fd->eventfd_async && !io_wq_current_is_worker())
+ goto out;
- if (!ev_fd->eventfd_async || io_wq_current_is_worker())
+ if (likely(eventfd_signal_allowed())) {
eventfd_signal(ev_fd->cq_ev_fd, 1);
+ } else {
+ atomic_inc(&ev_fd->refs);
+ if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops))
+ call_rcu(&ev_fd->rcu, io_eventfd_ops);
+ else
+ atomic_dec(&ev_fd->refs);
+ }
+
out:
rcu_read_unlock();
}
+static void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
+{
+ bool skip;
+
+ spin_lock(&ctx->completion_lock);
+
+ /*
+ * Eventfd should only get triggered when at least one event has been
+ * posted. Some applications rely on the eventfd notification count
+ * only changing IFF a new CQE has been added to the CQ ring. There's
+ * no depedency on 1:1 relationship between how many times this
+ * function is called (and hence the eventfd count) and number of CQEs
+ * posted to the CQ ring.
+ */
+ skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
+ ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
+ spin_unlock(&ctx->completion_lock);
+ if (skip)
+ return;
+
+ io_eventfd_signal(ctx);
+}
+
void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
if (ctx->off_timeout_used || ctx->drain_active) {
@@ -531,7 +573,7 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
spin_unlock(&ctx->completion_lock);
}
if (ctx->has_evfd)
- io_eventfd_signal(ctx);
+ io_eventfd_flush_signal(ctx);
}
static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx)
@@ -567,7 +609,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
io_cq_lock(ctx);
while (!list_empty(&ctx->cq_overflow_list)) {
- struct io_uring_cqe *cqe = io_get_cqe(ctx);
+ struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true);
struct io_overflow_cqe *ocqe;
if (!cqe && !force)
@@ -694,12 +736,19 @@ bool io_req_cqe_overflow(struct io_kiocb *req)
* control dependency is enough as we're using WRITE_ONCE to
* fill the cq entry
*/
-struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
+struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow)
{
struct io_rings *rings = ctx->rings;
unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
unsigned int free, queued, len;
+ /*
+ * Posting into the CQ when there are pending overflowed CQEs may break
+ * ordering guarantees, which will affect links, F_MORE users and more.
+ * Force overflow the completion.
+ */
+ if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
+ return NULL;
/* userspace may cheat modifying the tail, be safe and do min */
queued = min(__io_cqring_events(ctx), ctx->cq_entries);
@@ -823,8 +872,12 @@ inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags)
void io_req_complete_failed(struct io_kiocb *req, s32 res)
{
+ const struct io_op_def *def = &io_op_defs[req->opcode];
+
req_set_fail(req);
io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
+ if (def->fail)
+ def->fail(req);
io_req_complete_post(req);
}
@@ -1047,17 +1100,40 @@ void tctx_task_work(struct callback_head *cb)
trace_io_uring_task_work_run(tctx, count, loops);
}
-void io_req_task_work_add(struct io_kiocb *req)
+static void io_req_local_work_add(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!llist_add(&req->io_task_work.node, &ctx->work_llist))
+ return;
+
+ if (unlikely(atomic_read(&req->task->io_uring->in_idle))) {
+ io_move_task_work_from_local(ctx);
+ return;
+ }
+
+ if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
+ atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
+
+ if (ctx->has_evfd)
+ io_eventfd_signal(ctx);
+ io_cqring_wake(ctx);
+
+}
+
+static inline void __io_req_task_work_add(struct io_kiocb *req, bool allow_local)
{
struct io_uring_task *tctx = req->task->io_uring;
struct io_ring_ctx *ctx = req->ctx;
struct llist_node *node;
- bool running;
- running = !llist_add(&req->io_task_work.node, &tctx->task_list);
+ if (allow_local && ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
+ io_req_local_work_add(req);
+ return;
+ }
/* task_work already pending, we're done */
- if (running)
+ if (!llist_add(&req->io_task_work.node, &tctx->task_list))
return;
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
@@ -1077,6 +1153,84 @@ void io_req_task_work_add(struct io_kiocb *req)
}
}
+void io_req_task_work_add(struct io_kiocb *req)
+{
+ __io_req_task_work_add(req, true);
+}
+
+static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
+{
+ struct llist_node *node;
+
+ node = llist_del_all(&ctx->work_llist);
+ while (node) {
+ struct io_kiocb *req = container_of(node, struct io_kiocb,
+ io_task_work.node);
+
+ node = node->next;
+ __io_req_task_work_add(req, false);
+ }
+}
+
+int __io_run_local_work(struct io_ring_ctx *ctx, bool locked)
+{
+ struct llist_node *node;
+ struct llist_node fake;
+ struct llist_node *current_final = NULL;
+ int ret;
+ unsigned int loops = 1;
+
+ if (unlikely(ctx->submitter_task != current))
+ return -EEXIST;
+
+ node = io_llist_xchg(&ctx->work_llist, &fake);
+ ret = 0;
+again:
+ while (node != current_final) {
+ struct llist_node *next = node->next;
+ struct io_kiocb *req = container_of(node, struct io_kiocb,
+ io_task_work.node);
+ prefetch(container_of(next, struct io_kiocb, io_task_work.node));