diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-01-11 14:19:23 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-01-11 14:19:23 -0800 |
| commit | 4c72e2b8c42e57f65d8fbfb01329e79d2b450653 (patch) | |
| tree | 93dda9fd8adf1c1948c8dd83809aff4f207c7ea5 | |
| parent | 01d550f0fcc06c7292f79a6f1453aac122d1d2c8 (diff) | |
| parent | 6ff1407e24e6fdfa4a16ba9ba551e3d253a26391 (diff) | |
| download | linux-4c72e2b8c42e57f65d8fbfb01329e79d2b450653.tar.gz linux-4c72e2b8c42e57f65d8fbfb01329e79d2b450653.tar.bz2 linux-4c72e2b8c42e57f65d8fbfb01329e79d2b450653.zip | |
Merge tag 'for-6.8/io_uring-2024-01-08' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:
"Mostly just come fixes and cleanups, but one feature as well. In
detail:
- Harden the check for handling IOPOLL based on return (Pavel)
- Various minor optimizations (Pavel)
- Drop remnants of SCM_RIGHTS fd passing support, now that it's no
longer supported since 6.7 (me)
- Fix for a case where bytes_done wasn't initialized properly on a
failure condition for read/write requests (me)
- Move the register related code to a separate file (me)
- Add support for returning the provided ring buffer head (me)
- Add support for adding a direct descriptor to the normal file table
(me, Christian Brauner)
- Fix for ensuring pending task_work for a ring with DEFER_TASKRUN is
run even if we timeout waiting (me)"
* tag 'for-6.8/io_uring-2024-01-08' of git://git.kernel.dk/linux:
io_uring: ensure local task_work is run on wait timeout
io_uring/kbuf: add method for returning provided buffer ring head
io_uring/rw: ensure io->bytes_done is always initialized
io_uring: drop any code related to SCM_RIGHTS
io_uring/unix: drop usage of io_uring socket
io_uring/register: move io_uring_register(2) related code to register.c
io_uring/openclose: add support for IORING_OP_FIXED_FD_INSTALL
io_uring/cmd: inline io_uring_cmd_get_task
io_uring/cmd: inline io_uring_cmd_do_in_task_lazy
io_uring: split out cmd api into a separate header
io_uring: optimise ltimeout for inline execution
io_uring: don't check iopoll if request completes
| -rw-r--r-- | MAINTAINERS | 1 | ||||
| -rw-r--r-- | drivers/block/ublk_drv.c | 2 | ||||
| -rw-r--r-- | drivers/nvme/host/ioctl.c | 2 | ||||
| -rw-r--r-- | include/linux/io_uring.h | 90 | ||||
| -rw-r--r-- | include/linux/io_uring/cmd.h | 77 | ||||
| -rw-r--r-- | include/linux/io_uring_types.h | 34 | ||||
| -rw-r--r-- | include/uapi/linux/io_uring.h | 19 | ||||
| -rw-r--r-- | io_uring/Makefile | 2 | ||||
| -rw-r--r-- | io_uring/filetable.c | 11 | ||||
| -rw-r--r-- | io_uring/io_uring.c | 663 | ||||
| -rw-r--r-- | io_uring/io_uring.h | 19 | ||||
| -rw-r--r-- | io_uring/kbuf.c | 26 | ||||
| -rw-r--r-- | io_uring/kbuf.h | 1 | ||||
| -rw-r--r-- | io_uring/opdef.c | 9 | ||||
| -rw-r--r-- | io_uring/openclose.c | 44 | ||||
| -rw-r--r-- | io_uring/openclose.h | 3 | ||||
| -rw-r--r-- | io_uring/register.c | 605 | ||||
| -rw-r--r-- | io_uring/register.h | 8 | ||||
| -rw-r--r-- | io_uring/rsrc.c | 169 | ||||
| -rw-r--r-- | io_uring/rsrc.h | 15 | ||||
| -rw-r--r-- | io_uring/rw.c | 12 | ||||
| -rw-r--r-- | io_uring/uring_cmd.c | 15 | ||||
| -rw-r--r-- | net/core/scm.c | 2 | ||||
| -rw-r--r-- | net/unix/scm.c | 4 | ||||
| -rw-r--r-- | security/selinux/hooks.c | 2 | ||||
| -rw-r--r-- | security/smack/smack_lsm.c | 2 |
26 files changed, 890 insertions, 947 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 81f6b37e82c8..62a446462ef7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11142,6 +11142,7 @@ L: io-uring@vger.kernel.org S: Maintained T: git git://git.kernel.dk/linux-block T: git git://git.kernel.dk/liburing +F: include/linux/io_uring/ F: include/linux/io_uring.h F: include/linux/io_uring_types.h F: include/trace/events/io_uring.h diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 5aa729069bc1..1dfb2e77898b 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -36,7 +36,7 @@ #include <linux/sched/mm.h> #include <linux/uaccess.h> #include <linux/cdev.h> -#include <linux/io_uring.h> +#include <linux/io_uring/cmd.h> #include <linux/blk-mq.h> #include <linux/delay.h> #include <linux/mm.h> diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 093e46f16e6c..18f5c1be5d67 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -5,7 +5,7 @@ */ #include <linux/ptrace.h> /* for force_successful_syscall_return */ #include <linux/nvme_ioctl.h> -#include <linux/io_uring.h> +#include <linux/io_uring/cmd.h> #include "nvme.h" enum { diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 9e6ce6d4ab51..68ed6697fece 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -6,66 +6,13 @@ #include <linux/xarray.h> #include <uapi/linux/io_uring.h> -enum io_uring_cmd_flags { - IO_URING_F_COMPLETE_DEFER = 1, - IO_URING_F_UNLOCKED = 2, - /* the request is executed from poll, it should not be freed */ - IO_URING_F_MULTISHOT = 4, - /* executed by io-wq */ - IO_URING_F_IOWQ = 8, - /* int's last bit, sign checks are usually faster than a bit test */ - IO_URING_F_NONBLOCK = INT_MIN, - - /* ctx state flags, for URING_CMD */ - IO_URING_F_SQE128 = (1 << 8), - IO_URING_F_CQE32 = (1 << 9), - IO_URING_F_IOPOLL = (1 << 10), - - /* set when uring wants to cancel a previously issued command */ - IO_URING_F_CANCEL = (1 << 11), - IO_URING_F_COMPAT = (1 << 12), -}; - -/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */ -#define IORING_URING_CMD_CANCELABLE (1U << 30) - -struct io_uring_cmd { - struct file *file; - const struct io_uring_sqe *sqe; - /* callback to defer completions to task context */ - void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned); - u32 cmd_op; - u32 flags; - u8 pdu[32]; /* available inline for free use */ -}; - -static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) -{ - return sqe->cmd; -} - #if defined(CONFIG_IO_URING) -int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd); -void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2, - unsigned issue_flags); -struct sock *io_uring_get_socket(struct file *file); void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); void io_uring_unreg_ringfd(void); const char *io_uring_get_opcode(u8 opcode); -void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned), - unsigned flags); -/* users should follow semantics of IOU_F_TWQ_LAZY_WAKE */ -void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)); - -static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) -{ - __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); -} +int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); +bool io_is_uring_fops(struct file *file); static inline void io_uring_files_cancel(void) { @@ -84,32 +31,7 @@ static inline void io_uring_free(struct task_struct *tsk) if (tsk->io_uring) __io_uring_free(tsk); } -int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); -void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, - unsigned int issue_flags); -struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd); #else -static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd) -{ - return -EOPNOTSUPP; -} -static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, - ssize_t ret2, unsigned issue_flags) -{ -} -static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) -{ -} -static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) -{ -} -static inline struct sock *io_uring_get_socket(struct file *file) -{ - return NULL; -} static inline void io_uring_task_cancel(void) { } @@ -128,13 +50,9 @@ static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd, { return -EOPNOTSUPP; } -static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, - unsigned int issue_flags) -{ -} -static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd) +static inline bool io_is_uring_fops(struct file *file) { - return NULL; + return false; } #endif diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h new file mode 100644 index 000000000000..e453a997c060 --- /dev/null +++ b/include/linux/io_uring/cmd.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _LINUX_IO_URING_CMD_H +#define _LINUX_IO_URING_CMD_H + +#include <uapi/linux/io_uring.h> +#include <linux/io_uring_types.h> + +/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */ +#define IORING_URING_CMD_CANCELABLE (1U << 30) + +struct io_uring_cmd { + struct file *file; + const struct io_uring_sqe *sqe; + /* callback to defer completions to task context */ + void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned); + u32 cmd_op; + u32 flags; + u8 pdu[32]; /* available inline for free use */ +}; + +static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) +{ + return sqe->cmd; +} + +#if defined(CONFIG_IO_URING) +int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, + struct iov_iter *iter, void *ioucmd); +void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2, + unsigned issue_flags); +void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned), + unsigned flags); + +void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, + unsigned int issue_flags); + +#else +static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, + struct iov_iter *iter, void *ioucmd) +{ + return -EOPNOTSUPP; +} +static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, + ssize_t ret2, unsigned issue_flags) +{ +} +static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned), + unsigned flags) +{ +} +static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ +} +#endif + +/* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */ +static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +{ + __io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE); +} + +static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +{ + __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); +} + +static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd) +{ + return cmd_to_io_kiocb(cmd)->task; +} + +#endif /* _LINUX_IO_URING_CMD_H */ diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 239a4f68801b..854ad67a5f70 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -7,6 +7,37 @@ #include <linux/llist.h> #include <uapi/linux/io_uring.h> +enum { + /* + * A hint to not wake right away but delay until there are enough of + * tw's queued to match the number of CQEs the task is waiting for. + * + * Must not be used wirh requests generating more than one CQE. + * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set. + */ + IOU_F_TWQ_LAZY_WAKE = 1, +}; + +enum io_uring_cmd_flags { + IO_URING_F_COMPLETE_DEFER = 1, + IO_URING_F_UNLOCKED = 2, + /* the request is executed from poll, it should not be freed */ + IO_URING_F_MULTISHOT = 4, + /* executed by io-wq */ + IO_URING_F_IOWQ = 8, + /* int's last bit, sign checks are usually faster than a bit test */ + IO_URING_F_NONBLOCK = INT_MIN, + + /* ctx state flags, for URING_CMD */ + IO_URING_F_SQE128 = (1 << 8), + IO_URING_F_CQE32 = (1 << 9), + IO_URING_F_IOPOLL = (1 << 10), + + /* set when uring wants to cancel a previously issued command */ + IO_URING_F_CANCEL = (1 << 11), + IO_URING_F_COMPAT = (1 << 12), +}; + struct io_wq_work_node { struct io_wq_work_node *next; }; @@ -358,9 +389,6 @@ struct io_ring_ctx { struct wait_queue_head rsrc_quiesce_wq; unsigned rsrc_quiesce; - #if defined(CONFIG_UNIX) - struct socket *ring_sock; - #endif /* hashed buffered write serialization */ struct io_wq_hash *hash_map; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index f1c16f817742..7a673b52827b 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -71,6 +71,7 @@ struct io_uring_sqe { __u32 uring_cmd_flags; __u32 waitid_flags; __u32 futex_flags; + __u32 install_fd_flags; }; __u64 user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -253,6 +254,7 @@ enum io_uring_op { IORING_OP_FUTEX_WAIT, IORING_OP_FUTEX_WAKE, IORING_OP_FUTEX_WAITV, + IORING_OP_FIXED_FD_INSTALL, /* this goes last, obviously */ IORING_OP_LAST, @@ -387,6 +389,13 @@ enum { #define IORING_MSG_RING_FLAGS_PASS (1U << 1) /* + * IORING_OP_FIXED_FD_INSTALL flags (sqe->install_fd_flags) + * + * IORING_FIXED_FD_NO_CLOEXEC Don't mark the fd as O_CLOEXEC + */ +#define IORING_FIXED_FD_NO_CLOEXEC (1U << 0) + +/* * IO completion data structure (Completion Queue Entry) */ struct io_uring_cqe { @@ -558,6 +567,9 @@ enum { /* register a range of fixed file slots for automatic slot allocation */ IORING_REGISTER_FILE_ALLOC_RANGE = 25, + /* return status information for a buffer group */ + IORING_REGISTER_PBUF_STATUS = 26, + /* this goes last */ IORING_REGISTER_LAST, @@ -684,6 +696,13 @@ struct io_uring_buf_reg { __u64 resv[3]; }; +/* argument for IORING_REGISTER_PBUF_STATUS */ +struct io_uring_buf_status { + __u32 buf_group; /* input */ + __u32 head; /* output */ + __u32 resv[8]; +}; + /* * io_uring_restriction->opcode values */ diff --git a/io_uring/Makefile b/io_uring/Makefile index e5be47e4fc3b..2cdc51825405 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -8,6 +8,6 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ statx.o net.o msg_ring.o timeout.o \ sqpoll.o fdinfo.o tctx.o poll.o \ cancel.o kbuf.o rsrc.o rw.o opdef.o \ - notif.o waitid.o + notif.o waitid.o register.o obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o diff --git a/io_uring/filetable.c b/io_uring/filetable.c index e7d749991de4..6e86e6188dbe 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -87,13 +87,10 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, io_file_bitmap_clear(&ctx->file_table, slot_index); } - ret = io_scm_file_account(ctx, file); - if (!ret) { - *io_get_tag_slot(ctx->file_data, slot_index) = 0; - io_fixed_file_set(file_slot, file); - io_file_bitmap_set(&ctx->file_table, slot_index); - } - return ret; + *io_get_tag_slot(ctx->file_data, slot_index) = 0; + io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, slot_index); + return 0; } int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9839016f82ea..09b6d860deba 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -60,7 +60,6 @@ #include <linux/net.h> #include <net/sock.h> #include <net/af_unix.h> -#include <net/scm.h> #include <linux/anon_inodes.h> #include <linux/sched/mm.h> #include <linux/uaccess.h> @@ -70,6 +69,7 @@ #include <linux/fadvise.h> #include <linux/task_work.h> #include <linux/io_uring.h> +#include <linux/io_uring/cmd.h> #include <linux/audit.h> #include <linux/security.h> #include <asm/shmparam.h> @@ -85,6 +85,7 @@ #include "opdef.h" #include "refs.h" #include "tctx.h" +#include "register.h" #include "sqpoll.h" #include "fdinfo.h" #include "kbuf.h" @@ -103,9 +104,6 @@ #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) -#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ - IORING_REGISTER_LAST + IORING_OP_LAST) - #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ IOSQE_IO_HARDLINK | IOSQE_ASYNC) @@ -129,11 +127,6 @@ enum { IO_CHECK_CQ_DROPPED_BIT, }; -enum { - IO_EVENTFD_OP_SIGNAL_BIT, - IO_EVENTFD_OP_FREE_BIT, -}; - struct io_defer_entry { struct list_head list; struct io_kiocb *req; @@ -177,19 +170,6 @@ static struct ctl_table kernel_io_uring_disabled_table[] = { }; #endif -struct sock *io_uring_get_socket(struct file *file) -{ -#if defined(CONFIG_UNIX) - if (io_is_uring_fops(file)) { - struct io_ring_ctx *ctx = file->private_data; - - return ctx->ring_sock->sk; - } -#endif - return NULL; -} -EXPORT_SYMBOL(io_uring_get_socket); - static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) { if (!wq_list_empty(&ctx->submit_state.compl_reqs) || @@ -554,8 +534,7 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx) } } - -static void io_eventfd_ops(struct rcu_head *rcu) +void io_eventfd_ops(struct rcu_head *rcu) { struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); int ops = atomic_xchg(&ev_fd->ops, 0); @@ -1898,14 +1877,19 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) io_req_complete_defer(req); else io_req_complete_post(req, issue_flags); - } else if (ret != IOU_ISSUE_SKIP_COMPLETE) - return ret; - /* If the op doesn't have a file, we're not polling for it */ - if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue) - io_iopoll_req_issued(req, issue_flags); + return 0; + } - return 0; + if (ret == IOU_ISSUE_SKIP_COMPLETE) { + ret = 0; + io_arm_ltimeout(req); + + /* If the op doesn't have a file, we're not polling for it */ + if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue) + io_iopoll_req_issued(req, issue_flags); + } + return ret; } int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts) @@ -2076,9 +2060,7 @@ static inline void io_queue_sqe(struct io_kiocb *req) * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ - if (likely(!ret)) - io_arm_ltimeout(req); - else + if (unlikely(ret)) io_queue_async(req, ret); } @@ -2633,8 +2615,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, __set_current_state(TASK_RUNNING); atomic_set(&ctx->cq_wait_nr, 0); - if (ret < 0) - break; /* * Run task_work after scheduling and before io_should_wake(). * If we got woken because of task_work being processed, run it @@ -2644,6 +2624,18 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, if (!llist_empty(&ctx->work_llist)) io_run_local_work(ctx); + /* + * Non-local task_work will be run on exit to userspace, but + * if we're using DEFER_TASKRUN, then we could have waited + * with a timeout for a number of requests. If the timeout + * hits, we could have some requests ready to process. Ensure + * this break is _after_ we have run task_work, to avoid + * deferring running potentially pending requests until the + * next time we wait for events. + */ + if (ret < 0) + break; + check_cq = READ_ONCE(ctx->check_cq); if (unlikely(check_cq)) { /* let the caller flush overflows, retry */ @@ -2831,61 +2823,6 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries return off; } -static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned int eventfd_async) -{ - struct io_ev_fd *ev_fd; - __s32 __user *fds = arg; - int fd; - - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) - return -EBUSY; - - if (copy_from_user(&fd, fds, sizeof(*fds))) - return -EFAULT; - - ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); - if (!ev_fd) - return -ENOMEM; - - ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); - if (IS_ERR(ev_fd->cq_ev_fd)) { - int ret = PTR_ERR(ev_fd->cq_ev_fd); - kfree(ev_fd); - return ret; - } - - spin_lock(&ctx->completion_lock); - ctx->evfd_last_cq_tail = ctx->cached_cq_tail; - spin_unlock(&ctx->completion_lock); - - ev_fd->eventfd_async = eventfd_async; - ctx->has_evfd = true; - rcu_assign_pointer(ctx->io_ev_fd, ev_fd); - atomic_set(&ev_fd->refs, 1); - atomic_set(&ev_fd->ops, 0); - return 0; -} - -static int io_eventfd_unregister(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) { - ctx->has_evfd = false; - rcu_assign_pointer(ctx->io_ev_fd, NULL); - if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops)) - call_rcu(&ev_fd->rcu, io_eventfd_ops); - return 0; - } - - return -ENXIO; -} - static void io_req_caches_free(struct io_ring_ctx *ctx) { struct io_kiocb *req; @@ -2938,13 +2875,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_rsrc_node_destroy(ctx, ctx->rsrc_node); WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); - -#if defined(CONFIG_UNIX) - if (ctx->ring_sock) { - ctx->ring_sock->file = NULL; /* so that iput() is called */ - sock_release(ctx->ring_sock); - } -#endif WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free); @@ -2984,7 +2914,7 @@ static __cold void io_activate_pollwq_cb(struct callback_head *cb) percpu_ref_put(&ctx->refs); } -static __cold void io_activate_pollwq(struct io_ring_ctx *ctx) +__cold void io_activate_pollwq(struct io_ring_ctx *ctx) { spin_lock(&ctx->completion_lock); /* already activated or in progress */ @@ -3043,19 +2973,6 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) return mask; } -static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) -{ - const struct cred *creds; - - creds = xa_erase(&ctx->personalities, id); - if (creds) { - put_cred(creds); - return 0; - } - - return -EINVAL; -} - struct io_tctx_exit { struct callback_head task_work; struct completion completion; @@ -3866,32 +3783,12 @@ static int io_uring_install_fd(struct file *file) /* * Allocate an anonymous fd, this is what constitutes the application * visible backing of an io_uring instance. The application mmaps this - * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled, - * we have to tie this fd to a socket for file garbage collection purposes. + * fd to gain access to the SQ/CQ ring details. */ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) { - struct file *file; -#if defined(CONFIG_UNIX) - int ret; - - ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP, - &ctx->ring_sock); - if (ret) - return ERR_PTR(ret); -#endif - - file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx, + return anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx, O_RDWR | O_CLOEXEC, NULL); -#if defined(CONFIG_UNIX) - if (IS_ERR(file)) { - sock_release(ctx->ring_sock); - ctx->ring_sock = NULL; - } else { - ctx->ring_sock->file = file; - } -#endif - return file; } static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, @@ -4158,506 +4055,6 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries, return io_uring_setup(entries, params); } -static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args) -{ - struct io_uring_probe *p; - size_t size; - int i, ret; - - size = struct_size(p, ops, nr_args); - if (size == SIZE_MAX) - return -EOVERFLOW; - p = kzalloc(size, GFP_KERNEL); - if (!p) - return -ENOMEM; - - ret = -EFAULT; - if (copy_from_user(p, arg, size)) - goto out; - ret = -EINVAL; - if (memchr_inv(p, 0, size)) - goto out; - - p->last_op = IORING_OP_LAST - 1; - if (nr_args > IORING_OP_LAST) - nr_args = IORING_OP_LAST; - - for (i = 0; i < nr_args; i++) { - p->ops[i].op = i; - if (!io_issue_defs[i].not_supported) - p->ops[i].flags = IO_URING_OP_SUPPORTED; - } - p->ops_len = i; - - ret = 0; - if (copy_to_user(arg, p, size)) - ret = -EFAULT; -out: - kfree(p); - return ret; -} - -static int io_register_personality(struct io_ring_ctx *ctx) -{ - const struct cred *creds; - u32 id; - int ret; - - creds = get_current_cred(); - - ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds, - XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL); - if (ret < 0) { - put_cred(creds); - return ret; - } - return id; -} - -static __cold int io_register_restrictions(struct io_ring_ctx *ctx, - void __user *arg, unsigned int nr_args) -{ - struct io_uring_restriction *res; - size_t size; - int i, ret; - - /* Restrictions allowed only if rings started disabled */ - if (!(ctx->flags & IORING_SETUP_R_DISABLED)) - return -EBADFD; - - /* We allow only a single restrictions registration */ - if (ctx->restrictions.registered) - return -EBUSY; - - if (!arg || nr_args > IORING_MAX_RESTRICTIONS) - return -EINVAL; - - size = array_size(nr_args, sizeof(*res)); - if (size == SIZE_MAX) - return -EOVERFLOW; - - res = memdup_user(arg, size); - if (IS_ERR(res)) - return PTR_ERR(res); - - ret = 0; - - for (i = 0; i < nr_args; i++) { - switch (res[i].opcode) { - case IORING_RESTRICTION_REGISTER_OP: - if (res[i].register_op >= IORING_REGISTER_LAST) { - ret = -EINVAL; - goto out; - } - - __set_bit(res[i].register_op, - ctx->restrictions.register_op); - break; - case IORING_RESTRICTION_SQE_OP: - if (res[i].sqe_op >= IORING_OP_LAST) { - ret = -EINVAL; - goto out; - } - - __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op); - break; - case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: - ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags; - break; - case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: - ctx->restrictions.sqe_flags_required = res[i].sqe_flags; - break; - default: - ret = -EINVAL; - goto out; - } - } - -out: - /* Reset all restrictions if an error happened */ - if (ret != 0) - memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); - else - ctx->restrictions.registered = true; - - kfree(res); - return ret; -} - -static int io_register_enable_rings(struct io_ring_ctx *ctx) -{ - if (!(ctx->flags & IORING_SETUP_R_DISABLED)) - return -EBADFD; - - if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) { - WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); - /* - * Lazy activation attempts would fail if it was polled before - * submitter_task is set. - */ - if (wq_has_sleeper(&ctx->poll_wq)) - io_activate_pollwq(ctx); - } - - if (ctx->restrictions.registered) - ctx->restricted = 1; - - ctx->flags &= ~IORING_SETUP_R_DISABLED; - if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait)) - wake_up(&ctx->sq_data->wait); - return 0; -} - -static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx, - cpumask_var_t new_mask) -{ - int ret; - - if (!(ctx->flags & IORING_SETUP_SQPOLL)) { - ret = io_wq_cpu_affinity(current->io_uring, new_mask); - } else { - mutex_unlock(&ctx->uring_lock); - ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask); - mutex_lock(&ctx->uring_lock); - } - - return ret; -} - -static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, - void __user *arg, unsigned len) -{ - cpumask_var_t new_mask; - int ret; - - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) - return -ENOMEM; - - cpumask_clear(new_mask); - if (len > cpumask_size()) - len = cpumask_size(); - - if (in_compat_syscall()) { - ret = compat_get_bitmap(cpumask_bits(new_mask), - (const compat_ulong_t __user *)arg, - len * 8 /* CHAR_BIT */); - } else { - ret = copy_from_user(new_mask, arg, len); - } - - if (ret) { - free_cpumask_var(new_mask); - return -EFAULT; - } - - ret = __io_register |
