37 files changed, 1001 insertions, 452 deletions
diff --git a/Documentation/networking/iou-zcrx.rst b/Documentation/networking/iou-zcrx.rst
index 0127319b30bb..54a72e172bdc 100644
--- a/Documentation/networking/iou-zcrx.rst
+++ b/Documentation/networking/iou-zcrx.rst
@@ -75,7 +75,7 @@ Create an io_uring instance with the following required setup flags::
 
   IORING_SETUP_SINGLE_ISSUER
   IORING_SETUP_DEFER_TASKRUN
-  IORING_SETUP_CQE32
+  IORING_SETUP_CQE32 or IORING_SETUP_CQE_MIXED
 
 Create memory area
 ------------------
diff --git a/block/ioctl.c b/block/ioctl.c
index f7b0006ca45d..c9ea8e53871e 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -776,7 +776,7 @@ static void blk_cmd_complete(struct io_uring_cmd *cmd, unsigned int issue_flags)
 	if (bic->res == -EAGAIN && bic->nowait)
 		io_uring_cmd_issue_blocking(cmd);
 	else
-		io_uring_cmd_done(cmd, bic->res, 0, issue_flags);
+		io_uring_cmd_done(cmd, bic->res, issue_flags);
 }
 
 static void bio_cmd_bio_end_io(struct bio *bio)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 67d4a867aec4..8fdc26a61104 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -1189,7 +1189,7 @@ static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
 	struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req);
 
 	/* tell ublksrv one io request is coming */
-	io_uring_cmd_done(cmd, res, 0, issue_flags);
+	io_uring_cmd_done(cmd, res, issue_flags);
 }
 
 #define UBLK_REQUEUE_DELAY_MS	3
@@ -1873,7 +1873,7 @@ static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
 	spin_unlock(&ubq->cancel_lock);
 
 	if (!done)
-		io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0, issue_flags);
+		io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags);
 }
 
 /*
@@ -2520,7 +2520,7 @@ static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd,
 	int ret = ublk_ch_uring_cmd_local(cmd, issue_flags);
 
 	if (ret != -EIOCBQUEUED)
-		io_uring_cmd_done(cmd, ret, 0, issue_flags);
+		io_uring_cmd_done(cmd, ret, issue_flags);
 }
 
 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 6b3ac8ae3f34..e28bb9113f64 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -410,7 +410,7 @@ static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd,
 	if (pdu->bio)
 		blk_rq_unmap_user(pdu->bio);
 
-	io_uring_cmd_done(ioucmd, pdu->status, pdu->result, issue_flags);
+	io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, issue_flags);
 }
 
 static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a454b5ba2097..185bef0df1c2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4695,7 +4695,7 @@ out:
 	btrfs_unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state);
 	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
-	io_uring_cmd_done(cmd, ret, 0, issue_flags);
+	io_uring_cmd_done(cmd, ret, issue_flags);
 	add_rchar(current, ret);
 
 	for (index = 0; index < priv->nr_pages; index++)
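The hunks above all make the same mechanical change: io_uring_cmd_done() drops its always-zero res2 argument, and the one caller that actually carries a second result (nvme passing pdu->result) moves to the new io_uring_cmd_done32() helper, which posts a big CQE. A minimal sketch of the resulting calling convention; the completion path shown (my_cmd_complete) is hypothetical, not part of the patch:

    /* Hypothetical driver completion path illustrating the split API. */
    static void my_cmd_complete(struct io_uring_cmd *cmd, int res, u64 res2,
                                unsigned int issue_flags)
    {
            if (res2)
                    /* 32b completion: res2 lands in the CQE's extra fields */
                    io_uring_cmd_done32(cmd, res, res2, issue_flags);
            else
                    /* common case: plain 16b CQE, res2 implicitly zero */
                    io_uring_cmd_done(cmd, res, issue_flags);
    }
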
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index 249b210becb1..a30c44234a4e 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -351,7 +351,7 @@ static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent)
 	spin_unlock(&queue->lock);
 
 	if (cmd)
-		io_uring_cmd_done(cmd, -ENOTCONN, 0, IO_URING_F_UNLOCKED);
+		io_uring_cmd_done(cmd, -ENOTCONN, IO_URING_F_UNLOCKED);
 
 	if (req)
 		fuse_uring_stop_fuse_req_end(req);
@@ -518,7 +518,7 @@ static void fuse_uring_cancel(struct io_uring_cmd *cmd,
 
 	if (need_cmd_done) {
 		/* no queue lock to avoid lock order issues */
-		io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags);
+		io_uring_cmd_done(cmd, -ENOTCONN, issue_flags);
 	}
 }
 
@@ -733,7 +733,7 @@ static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent,
 	list_move_tail(&ent->list, &queue->ent_in_userspace);
 	spin_unlock(&queue->lock);
 
-	io_uring_cmd_done(cmd, 0, 0, issue_flags);
+	io_uring_cmd_done(cmd, 0, issue_flags);
 	return 0;
 }
 
@@ -1200,7 +1200,7 @@ static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd,
 	ent->cmd = NULL;
 	spin_unlock(&queue->lock);
 
-	io_uring_cmd_done(cmd, ret, 0, issue_flags);
+	io_uring_cmd_done(cmd, ret, issue_flags);
 }
 
 /*
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index cfa6d0c0c322..7509025b4071 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -11,11 +11,14 @@
 /* io_uring_cmd is being issued again */
 #define IORING_URING_CMD_REISSUE	(1U << 31)
 
+typedef void (*io_uring_cmd_tw_t)(struct io_uring_cmd *cmd,
+				  unsigned issue_flags);
+
 struct io_uring_cmd {
 	struct file	*file;
 	const struct io_uring_sqe *sqe;
 	/* callback to defer completions to task context */
-	void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned);
+	io_uring_cmd_tw_t task_work_cb;
 	u32		cmd_op;
 	u32		flags;
 	u8		pdu[32]; /* available inline for free use */
@@ -53,11 +56,11 @@ int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
  * Note: the caller should never hard code @issue_flags and is only allowed
  * to pass the mask provided by the core io_uring code.
  */
-void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, u64 res2,
-		       unsigned issue_flags);
+void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, u64 res2,
+			 unsigned issue_flags, bool is_cqe32);
 
 void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned),
+			io_uring_cmd_tw_t task_work_cb,
 			unsigned flags);
 
 /*
@@ -70,6 +73,21 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
 /* Execute the request from a blocking context */
 void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd);
 
+/*
+ * Select a buffer from the provided buffer group for multishot uring_cmd.
+ * Returns the selected buffer address and size.
+ */
+struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd,
+					    unsigned buf_group, size_t *len,
+					    unsigned int issue_flags);
+
+/*
+ * Complete a multishot uring_cmd event. This will post a CQE to the completion
+ * queue and update the provided buffer.
+ */
+bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
+				 struct io_br_sel *sel, unsigned int issue_flags);
+
 #else
 static inline int
 io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
@@ -86,13 +104,12 @@ static inline int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
 {
 	return -EOPNOTSUPP;
 }
-static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
-				     u64 ret2, unsigned issue_flags)
+static inline void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret,
+				       u64 ret2, unsigned issue_flags, bool is_cqe32)
 {
 }
 static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned),
-			unsigned flags)
+			io_uring_cmd_tw_t task_work_cb, unsigned flags)
 {
 }
 static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
@@ -102,28 +119,28 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
 static inline void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
 {
 }
-#endif
-
-/*
- * Polled completions must ensure they are coming from a poll queue, and
- * hence are completed inside the usual poll handling loops.
- */
-static inline void io_uring_cmd_iopoll_done(struct io_uring_cmd *ioucmd,
-					    ssize_t ret, ssize_t res2)
+static inline struct io_br_sel
+io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, unsigned buf_group,
+			   size_t *len, unsigned int issue_flags)
+{
+	return (struct io_br_sel) { .val = -EOPNOTSUPP };
+}
+static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
+			struct io_br_sel *sel, unsigned int issue_flags)
 {
-	lockdep_assert(in_task());
-	io_uring_cmd_done(ioucmd, ret, res2, 0);
+	return true;
 }
+#endif
 
 /* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */
 static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
+			io_uring_cmd_tw_t task_work_cb)
 {
 	__io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE);
 }
 
 static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
+			io_uring_cmd_tw_t task_work_cb)
 {
 	__io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0);
 }
@@ -142,6 +159,18 @@ static inline void *io_uring_cmd_ctx_handle(struct io_uring_cmd *cmd)
 	return cmd_to_io_kiocb(cmd)->ctx;
 }
 
+static inline void io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret,
+				     unsigned issue_flags)
+{
+	return __io_uring_cmd_done(ioucmd, ret, 0, issue_flags, false);
+}
+
+static inline void io_uring_cmd_done32(struct io_uring_cmd *ioucmd, s32 ret,
+				       u64 res2, unsigned issue_flags)
+{
+	return __io_uring_cmd_done(ioucmd, ret, res2, issue_flags, true);
+}
+
 int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
 			    void (*release)(void *), unsigned int index,
 			    unsigned int issue_flags);
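The two new exports above are the driver-facing half of multishot uring_cmd support: io_uring_cmd_buffer_select() picks a buffer from the request's provided-buffer group and hands it back in a struct io_br_sel (defined in the io_uring_types.h hunk below), and io_uring_mshot_cmd_post_cqe() posts one completion per event. A rough sketch of the intended call pattern in a ->uring_cmd() handler; the handler itself is hypothetical, and the error and return-value conventions are assumptions rather than taken from this patch:

    /* Hypothetical ->uring_cmd() handler using the multishot helpers. */
    static int my_mshot_uring_cmd(struct io_uring_cmd *cmd,
                                  unsigned int issue_flags)
    {
            unsigned buf_group = READ_ONCE(cmd->sqe->buf_index);
            size_t len = 0;
            struct io_br_sel sel;

            sel = io_uring_cmd_buffer_select(cmd, buf_group, &len, issue_flags);
            if (!sel.addr)
                    return sel.val;         /* assumed: failure in ->val */

            /* ... copy up to len bytes of event payload to sel.addr ... */

            /* assumed: true means the CQE was posted and more may follow */
            if (io_uring_mshot_cmd_post_cqe(cmd, &sel, issue_flags))
                    return -EIOCBQUEUED;
            return -ECANCELED;
    }
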
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 12f5ee43850e..c2ea6280901d 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -86,6 +86,25 @@ struct io_mapped_region {
 };
 
 /*
+ * Return value from io_buffer_list selection, to avoid stashing it in
+ * struct io_kiocb. For legacy/classic provided buffers, keeping a reference
+ * across execution contexts is fine. But for ring provided buffers, the
+ * list may go away as soon as ->uring_lock is dropped. As the io_kiocb
+ * persists, it's better to just keep the buffer local for those cases.
+ */
+struct io_br_sel {
+	struct io_buffer_list *buf_list;
+	/*
+	 * Some selection parts return the user address, others return an error.
+	 */
+	union {
+		void __user *addr;
+		ssize_t val;
+	};
+};
+
+/*
  * Arbitrary limit, can be raised if need be
  */
 #define IO_RINGFD_REG_MAX 16
@@ -671,12 +690,6 @@ struct io_kiocb {
 		/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
 		struct io_buffer	*kbuf;
 
-		/*
-		 * stores buffer ID for ring provided buffers, valid IFF
-		 * REQ_F_BUFFER_RING is set.
-		 */
-		struct io_buffer_list	*buf_list;
-
 		struct io_rsrc_node	*buf_node;
 	};
 
@@ -724,10 +737,4 @@ struct io_overflow_cqe {
 	struct list_head list;
 	struct io_uring_cqe cqe;
 };
-
-static inline bool io_ctx_cqe32(struct io_ring_ctx *ctx)
-{
-	return ctx->flags & IORING_SETUP_CQE32;
-}
-
 #endif
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 8ca2235f78d5..299e2dd7da6d 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -90,4 +90,7 @@
 /********** lib/stackdepot.c **********/
 #define STACK_DEPOT_POISON ((void *)(0xD390 + POISON_POINTER_DELTA))
 
+/********** io_uring/ **********/
+#define IO_URING_PTR_POISON ((void *)(0x1091UL + POISON_POINTER_DELTA))
+
 #endif
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 178ab6f611be..45d15460b495 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -340,8 +340,8 @@ TP_PROTO(struct io_ring_ctx *ctx, void *req, struct io_uring_cqe *cqe),
 		__entry->user_data	= cqe->user_data;
 		__entry->res		= cqe->res;
 		__entry->cflags		= cqe->flags;
-		__entry->extra1		= io_ctx_cqe32(ctx) ? cqe->big_cqe[0] : 0;
-		__entry->extra2		= io_ctx_cqe32(ctx) ? cqe->big_cqe[1] : 0;
+		__entry->extra1		= ctx->flags & IORING_SETUP_CQE32 || cqe->flags & IORING_CQE_F_32 ? cqe->big_cqe[0] : 0;
+		__entry->extra2		= ctx->flags & IORING_SETUP_CQE32 || cqe->flags & IORING_CQE_F_32 ? cqe->big_cqe[1] : 0;
 	),
 
 	TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x "
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 6957dc539d83..a0cc1cc0dd01 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -225,6 +225,12 @@ enum io_uring_sqe_flags_bit {
 /* Use hybrid poll in iopoll process */
 #define IORING_SETUP_HYBRID_IOPOLL	(1U << 17)
 
+/*
+ * Allow both 16b and 32b CQEs. If a 32b CQE is posted, it will have
+ * IORING_CQE_F_32 set in cqe->flags.
+ */
+#define IORING_SETUP_CQE_MIXED		(1U << 18)
+
 enum io_uring_op {
 	IORING_OP_NOP,
 	IORING_OP_READV,
@@ -298,9 +304,13 @@ enum io_uring_op {
  * sqe->uring_cmd_flags		top 8bits aren't available for userspace
  * IORING_URING_CMD_FIXED	use registered buffer; pass this flag
  *				along with setting sqe->buf_index.
+ * IORING_URING_CMD_MULTISHOT	must be used with buffer select, like other
+ *				multishot commands. Not compatible with
+ *				IORING_URING_CMD_FIXED, for now.
  */
 #define IORING_URING_CMD_FIXED	(1U << 0)
-#define IORING_URING_CMD_MASK	IORING_URING_CMD_FIXED
+#define IORING_URING_CMD_MULTISHOT	(1U << 1)
+#define IORING_URING_CMD_MASK	(IORING_URING_CMD_FIXED | IORING_URING_CMD_MULTISHOT)
 
@@ -454,6 +464,7 @@ enum io_uring_msg_ring_flags {
 #define IORING_NOP_FIXED_FILE	(1U << 2)
 #define IORING_NOP_FIXED_BUFFER	(1U << 3)
 #define IORING_NOP_TW		(1U << 4)
+#define IORING_NOP_CQE32	(1U << 5)
 
 /*
  * IO completion data structure (Completion Queue Entry)
@@ -487,12 +498,22 @@ struct io_uring_cqe {
  *			other provided buffer type, all completions with a
  *			buffer passed back is automatically returned to the
 *			application.
+ * IORING_CQE_F_SKIP	If set, then the application/liburing must ignore this
+ *			CQE. Its only purpose is to fill a gap in the ring,
+ *			if a large CQE posting is attempted when the ring has
+ *			just a single small CQE worth of space left before
+ *			wrapping.
+ * IORING_CQE_F_32	If set, this is a 32b/big-cqe posting. Used with rings
+ *			set up in a mixed CQE mode, where both 16b and 32b
+ *			CQEs may be posted to the CQ ring.
 */
 #define IORING_CQE_F_BUFFER		(1U << 0)
 #define IORING_CQE_F_MORE		(1U << 1)
 #define IORING_CQE_F_SOCK_NONEMPTY	(1U << 2)
 #define IORING_CQE_F_NOTIF		(1U << 3)
 #define IORING_CQE_F_BUF_MORE		(1U << 4)
+#define IORING_CQE_F_SKIP		(1U << 5)
+#define IORING_CQE_F_32			(1U << 15)
 
 #define IORING_CQE_BUFFER_SHIFT		16
 
@@ -665,6 +686,12 @@ enum io_uring_register_op {
 
 	IORING_REGISTER_MEM_REGION		= 34,
 
+	/* query various aspects of io_uring, see linux/io_uring/query.h */
+	IORING_REGISTER_QUERY			= 35,
+
+	/* return zcrx buffers back into circulation */
+	IORING_REGISTER_ZCRX_REFILL		= 36,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
@@ -1046,6 +1073,15 @@ struct io_uring_zcrx_ifq_reg {
 	__u64	__resv[3];
 };
 
+struct io_uring_zcrx_sync_refill {
+	__u32	zcrx_id;
+	/* the number of entries to return */
+	__u32	nr_entries;
+	/* pointer to an array of struct io_uring_zcrx_rqe */
+	__u64	rqes;
+	__u64	__resv[2];
+};
+
 #ifdef __cplusplus
 }
 #endif
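Taken together, IORING_SETUP_CQE_MIXED, IORING_CQE_F_32 and IORING_CQE_F_SKIP define how userspace walks a CQ ring carrying both CQE sizes: every posting starts at a small-CQE slot, a big posting consumes two slots and is flagged F_32, and F_SKIP pads the case where a big CQE would otherwise straddle the ring wrap. A sketch of a manual reaping loop under those rules; the cq_* field names and barrier helpers mirror liburing's conventions but are assumptions here, since liburing support is not part of this diff, and handle_cqe() is a placeholder for the application:

    /* Walk a mixed CQ ring; cq fields follow liburing's struct io_uring_cq. */
    unsigned head = *cq->khead;

    while (head != io_uring_smp_load_acquire(cq->ktail)) {
            struct io_uring_cqe *cqe = &cq->cqes[head & *cq->kring_mask];

            head++;                                 /* one small-CQE slot */
            if (cqe->flags & IORING_CQE_F_32)
                    head++;                         /* big CQE eats a second slot */
            if (cqe->flags & IORING_CQE_F_SKIP)
                    continue;                       /* wrap filler, not a completion */
            handle_cqe(cqe);                        /* application callback */
    }
    io_uring_smp_store_release(cq->khead, head);
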
diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_uring/query.h
new file mode 100644
index 000000000000..5d754322a27c
--- /dev/null
+++ b/include/uapi/linux/io_uring/query.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
+/*
+ * Header file for the io_uring query interface.
+ */
+#ifndef LINUX_IO_URING_QUERY_H
+#define LINUX_IO_URING_QUERY_H
+
+#include <linux/types.h>
+
+struct io_uring_query_hdr {
+	__u64	next_entry;
+	__u64	query_data;
+	__u32	query_op;
+	__u32	size;
+	__s32	result;
+	__u32	__resv[3];
+};
+
+enum {
+	IO_URING_QUERY_OPCODES			= 0,
+
+	__IO_URING_QUERY_MAX,
+};
+
+/* Doesn't require a ring */
+struct io_uring_query_opcode {
+	/* The number of supported IORING_OP_* opcodes */
+	__u32	nr_request_opcodes;
+	/* The number of supported IORING_[UN]REGISTER_* opcodes */
+	__u32	nr_register_opcodes;
+	/* Bitmask of all supported IORING_FEAT_* flags */
+	__u64	feature_flags;
+	/* Bitmask of all supported IORING_SETUP_* flags */
+	__u64	ring_setup_flags;
+	/* Bitmask of all supported IORING_ENTER_* flags */
+	__u64	enter_flags;
+	/* Bitmask of all supported IOSQE_* flags */
+	__u64	sqe_flags;
+};
+
+#endif
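This new header pairs with IORING_REGISTER_QUERY above: userspace passes one or more io_uring_query_hdr entries, chained via next_entry, with per-entry status coming back in result. A sketch of a single-entry query for the opcode information, assuming the raw io_uring_register(2) syscall, an existing ring_fd, and that nr_args counts header entries; none of this is liburing API:

    /* Ask the kernel which opcodes/flags it supports; single-entry chain. */
    struct io_uring_query_opcode ops = { };
    struct io_uring_query_hdr hdr = {
            .next_entry     = 0,                    /* no further entries */
            .query_data     = (__u64)(uintptr_t)&ops,
            .query_op       = IO_URING_QUERY_OPCODES,
            .size           = sizeof(ops),
    };

    if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_QUERY,
                &hdr, 1) == 0 && hdr.result == 0)
            printf("%u request opcodes, %u register opcodes\n",
                   ops.nr_request_opcodes, ops.nr_register_opcodes);
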
diff --git a/io_uring/Makefile b/io_uring/Makefile
index b3f1bd492804..bc4e4a3fa0a5 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -13,7 +13,7 @@ obj-$(CONFIG_IO_URING)		+= io_uring.o opdef.o kbuf.o rsrc.o notif.o \
 					sync.o msg_ring.o advise.o openclose.o \
 					statx.o timeout.o cancel.o \
 					waitid.o register.o truncate.o \
-					memmap.o alloc_cache.o
+					memmap.o alloc_cache.o query.o
 obj-$(CONFIG_IO_URING_ZCRX)	+= zcrx.o
 obj-$(CONFIG_IO_WQ)		+= io-wq.o
 obj-$(CONFIG_FUTEX)		+= futex.o
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 6d57602304df..64b51e82baa2 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -11,6 +11,7 @@
 
 #include <uapi/linux/io_uring.h>
 
+#include "filetable.h"
 #include "io_uring.h"
 #include "tctx.h"
 #include "poll.h"
diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c
index 3866fe6ff541..27a09aa4c9d0 100644
--- a/io_uring/cmd_net.c
+++ b/io_uring/cmd_net.c
@@ -4,6 +4,7 @@
 #include <net/sock.h>
 
 #include "uring_cmd.h"
+#include "io_uring.h"
 
 static inline int io_uring_cmd_getsockopt(struct socket *sock,
 					  struct io_uring_cmd *cmd,
@@ -73,7 +74,7 @@ static bool io_process_timestamp_skb(struct io_uring_cmd *cmd, struct sock *sk,
 
 	cqe->user_data = 0;
 	cqe->res = tskey;
-	cqe->flags = IORING_CQE_F_MORE;
+	cqe->flags = IORING_CQE_F_MORE | ctx_cqe32_flags(cmd_to_io_kiocb(cmd)->ctx);
 	cqe->flags |= tstype << IORING_TIMESTAMP_TYPE_SHIFT;
 	if (ret == SOF_TIMESTAMPING_TX_HARDWARE)
 		cqe->flags |= IORING_CQE_F_TSTAMP_HW;
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index 9798d6fb4ec7..ff3364531c77 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -9,7 +9,7 @@
 
 #include <uapi/linux/io_uring.h>
 
-#include "io_uring.h"
+#include "filetable.h"
 #include "sqpoll.h"
 #include "fdinfo.h"
 #include "cancel.h"
@@ -65,15 +65,12 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 	unsigned int sq_tail = READ_ONCE(r->sq.tail);
 	unsigned int cq_head = READ_ONCE(r->cq.head);
 	unsigned int cq_tail = READ_ONCE(r->cq.tail);
-	unsigned int cq_shift = 0;
 	unsigned int sq_shift = 0;
-	unsigned int sq_entries, cq_entries;
+	unsigned int sq_entries;
 	int sq_pid = -1, sq_cpu = -1;
 	u64 sq_total_time = 0, sq_work_time = 0;
 	unsigned int i;
 
-	if (ctx->flags & IORING_SETUP_CQE32)
-		cq_shift = 1;
 	if (ctx->flags & IORING_SETUP_SQE128)
 		sq_shift = 1;
 
@@ -125,18 +122,23 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 		seq_printf(m, "\n");
 	}
 	seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
-	cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
-	for (i = 0; i < cq_entries; i++) {
-		unsigned int entry = i + cq_head;
-		struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift];
+	while (cq_head < cq_tail) {
+		struct io_uring_cqe *cqe;
+		bool cqe32 = false;
 
+		cqe = &r->cqes[(cq_head & cq_mask)];
+		if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32)
+			cqe32 = true;
 		seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x",
-			   entry & cq_mask, cqe->user_data, cqe->res,
+			   cq_head & cq_mask, cqe->user_data, cqe->res,
 			   cqe->flags);
-		if (cq_shift)
+		if (cqe32)
 			seq_printf(m, ", extra1:%llu, extra2:%llu\n",
 				   cqe->big_cqe[0], cqe->big_cqe[1]);
 		seq_printf(m, "\n");
+		cq_head++;
+		if (cqe32)
+			cq_head++;
 	}
 
 	if (ctx->flags & IORING_SETUP_SQPOLL) {
diff --git a/io_uring/futex.c b/io_uring/futex.c
index 9113a44984f3..64f3bd51c84c 100644
--- a/io_uring/futex.c
+++ b/io_uring/futex.c
@@ -43,7 +43,6 @@ void io_futex_cache_free(struct io_ring_ctx *ctx)
 
 static void __io_futex_complete(struct io_kiocb *req, io_tw_token_t tw)
 {
-	req->async_data = NULL;
 	hlist_del_init(&req->hash_node);
 	io_req_task_complete(req, tw);
 }
@@ -54,6 +53,7 @@ static void io_futex_complete(struct io_kiocb *req, io_tw_token_t tw)
 
 	io_tw_lock(ctx, tw);
 	io_cache_free(&ctx->futex_cache, req->async_data);
+	io_req_async_data_clear(req, 0);
 	__io_futex_complete(req, tw);
 }
 
@@ -72,8 +72,7 @@ static void io_futexv_complete(struct io_kiocb *req, io_tw_token_t tw)
 		io_req_set_res(req, res, 0);
 	}
 
-	kfree(req->async_data);
-	req->flags &= ~REQ_F_ASYNC_DATA;
+	io_req_async_data_free(req);
 	__io_futex_complete(req, tw);
 }
 
@@ -232,9 +231,7 @@ int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags)
 		io_ring_submit_unlock(ctx, issue_flags);
 		req_set_fail(req);
 		io_req_set_res(req, ret, 0);
-		kfree(futexv);
-		req->async_data = NULL;
-		req->flags &= ~REQ_F_ASYNC_DATA;
+		io_req_async_data_free(req);
 		return IOU_COMPLETE;
 	}
 
@@ -310,9 +307,7 @@ done:
 	if (ret < 0)
 		req_set_fail(req);
 	io_req_set_res(req, ret, 0);
-	req->async_data = NULL;
-	req->flags &= ~REQ_F_ASYNC_DATA;
-	kfree(ifd);
+	io_req_async_data_free(req);
 	return IOU_COMPLETE;
 }
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 93665cebe9bd..49ebdeb5b2d9 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -79,6 +79,7 @@
 
 #include "io-wq.h"
 
+#include "filetable.h"
 #include "io_uring.h"
 #include "opdef.h"
 #include "refs.h"
@@ -108,9 +109,6 @@
 #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
 			  IOSQE_IO_HARDLINK | IOSQE_ASYNC)
 
-#define SQE_VALID_FLAGS	(SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
-			 IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)
-
 #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)
 
 #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
@@ -179,6 +177,26 @@ static const struct ctl_table kernel_io_uring_disabled_table[] = {
 };
 #endif
 
+static void io_poison_cached_req(struct io_kiocb *req)
+{
+	req->ctx = IO_URING_PTR_POISON;
+	req->tctx = IO_URING_PTR_POISON;
+	req->file = IO_URING_PTR_POISON;
+	req->creds = IO_URING_PTR_POISON;
+	req->io_task_work.func = IO_URING_PTR_POISON;
+	req->apoll = IO_URING_PTR_POISON;
+}
+
+static void io_poison_req(struct io_kiocb *req)
+{
+	io_poison_cached_req(req);
+	req->async_data = IO_URING_PTR_POISON;
+	req->kbuf = IO_URING_PTR_POISON;
+	req->comp_list.next = IO_URING_PTR_POISON;
+	req->file_node = IO_URING_PTR_POISON;
+	req->link = IO_URING_PTR_POISON;
+}
+
 static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
 {
 	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
@@ -235,6 +253,8 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res)
 static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
 {
+	if (IS_ENABLED(CONFIG_KASAN))
+		io_poison_cached_req(req);
 	wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
 }
 
@@ -594,27 +614,29 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx)
 
 static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying)
 {
-	size_t cqe_size = sizeof(struct io_uring_cqe);
-
 	lockdep_assert_held(&ctx->uring_lock);
 
 	/* don't abort if we're dying, entries must get freed */
 	if (!dying && __io_cqring_events(ctx) == ctx->cq_entries)
 		return;
 
-	if (ctx->flags & IORING_SETUP_CQE32)
-		cqe_size <<= 1;
-
 	io_cq_lock(ctx);
 	while (!list_empty(&ctx->cq_overflow_list)) {
+		size_t cqe_size = sizeof(struct io_uring_cqe);
 		struct io_uring_cqe *cqe;
 		struct io_overflow_cqe *ocqe;
+		bool is_cqe32 = false;
 
 		ocqe = list_first_entry(&ctx->cq_overflow_list,
					struct io_overflow_cqe, list);
+		if (ocqe->cqe.flags & IORING_CQE_F_32 ||
+		    ctx->flags & IORING_SETUP_CQE32) {
+			is_cqe32 = true;
+			cqe_size <<= 1;
+		}
 		if (!dying) {
-			if (!io_get_cqe_overflow(ctx, &cqe, true))
+			if (!io_get_cqe_overflow(ctx, &cqe, true, is_cqe32))
				break;
 			memcpy(cqe, &ocqe->cqe, cqe_size);
 		}
@@ -726,10 +748,12 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx,
 {
 	struct io_overflow_cqe *ocqe;
 	size_t ocq_size = sizeof(struct io_overflow_cqe);
-	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
+	bool is_cqe32 = false;
 
-	if (is_cqe32)
+	if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) {
+		is_cqe32 = true;
+		ocq_si
