summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-01-11 14:19:23 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2024-01-11 14:19:23 -0800
commit4c72e2b8c42e57f65d8fbfb01329e79d2b450653 (patch)
tree93dda9fd8adf1c1948c8dd83809aff4f207c7ea5
parent01d550f0fcc06c7292f79a6f1453aac122d1d2c8 (diff)
parent6ff1407e24e6fdfa4a16ba9ba551e3d253a26391 (diff)
downloadlinux-4c72e2b8c42e57f65d8fbfb01329e79d2b450653.tar.gz
linux-4c72e2b8c42e57f65d8fbfb01329e79d2b450653.tar.bz2
linux-4c72e2b8c42e57f65d8fbfb01329e79d2b450653.zip
Merge tag 'for-6.8/io_uring-2024-01-08' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe: "Mostly just come fixes and cleanups, but one feature as well. In detail: - Harden the check for handling IOPOLL based on return (Pavel) - Various minor optimizations (Pavel) - Drop remnants of SCM_RIGHTS fd passing support, now that it's no longer supported since 6.7 (me) - Fix for a case where bytes_done wasn't initialized properly on a failure condition for read/write requests (me) - Move the register related code to a separate file (me) - Add support for returning the provided ring buffer head (me) - Add support for adding a direct descriptor to the normal file table (me, Christian Brauner) - Fix for ensuring pending task_work for a ring with DEFER_TASKRUN is run even if we timeout waiting (me)" * tag 'for-6.8/io_uring-2024-01-08' of git://git.kernel.dk/linux: io_uring: ensure local task_work is run on wait timeout io_uring/kbuf: add method for returning provided buffer ring head io_uring/rw: ensure io->bytes_done is always initialized io_uring: drop any code related to SCM_RIGHTS io_uring/unix: drop usage of io_uring socket io_uring/register: move io_uring_register(2) related code to register.c io_uring/openclose: add support for IORING_OP_FIXED_FD_INSTALL io_uring/cmd: inline io_uring_cmd_get_task io_uring/cmd: inline io_uring_cmd_do_in_task_lazy io_uring: split out cmd api into a separate header io_uring: optimise ltimeout for inline execution io_uring: don't check iopoll if request completes
-rw-r--r--MAINTAINERS1
-rw-r--r--drivers/block/ublk_drv.c2
-rw-r--r--drivers/nvme/host/ioctl.c2
-rw-r--r--include/linux/io_uring.h90
-rw-r--r--include/linux/io_uring/cmd.h77
-rw-r--r--include/linux/io_uring_types.h34
-rw-r--r--include/uapi/linux/io_uring.h19
-rw-r--r--io_uring/Makefile2
-rw-r--r--io_uring/filetable.c11
-rw-r--r--io_uring/io_uring.c663
-rw-r--r--io_uring/io_uring.h19
-rw-r--r--io_uring/kbuf.c26
-rw-r--r--io_uring/kbuf.h1
-rw-r--r--io_uring/opdef.c9
-rw-r--r--io_uring/openclose.c44
-rw-r--r--io_uring/openclose.h3
-rw-r--r--io_uring/register.c605
-rw-r--r--io_uring/register.h8
-rw-r--r--io_uring/rsrc.c169
-rw-r--r--io_uring/rsrc.h15
-rw-r--r--io_uring/rw.c12
-rw-r--r--io_uring/uring_cmd.c15
-rw-r--r--net/core/scm.c2
-rw-r--r--net/unix/scm.c4
-rw-r--r--security/selinux/hooks.c2
-rw-r--r--security/smack/smack_lsm.c2
26 files changed, 890 insertions, 947 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 81f6b37e82c8..62a446462ef7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11142,6 +11142,7 @@ L: io-uring@vger.kernel.org
S: Maintained
T: git git://git.kernel.dk/linux-block
T: git git://git.kernel.dk/liburing
+F: include/linux/io_uring/
F: include/linux/io_uring.h
F: include/linux/io_uring_types.h
F: include/trace/events/io_uring.h
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 5aa729069bc1..1dfb2e77898b 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -36,7 +36,7 @@
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/cdev.h>
-#include <linux/io_uring.h>
+#include <linux/io_uring/cmd.h>
#include <linux/blk-mq.h>
#include <linux/delay.h>
#include <linux/mm.h>
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 093e46f16e6c..18f5c1be5d67 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -5,7 +5,7 @@
*/
#include <linux/ptrace.h> /* for force_successful_syscall_return */
#include <linux/nvme_ioctl.h>
-#include <linux/io_uring.h>
+#include <linux/io_uring/cmd.h>
#include "nvme.h"
enum {
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 9e6ce6d4ab51..68ed6697fece 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -6,66 +6,13 @@
#include <linux/xarray.h>
#include <uapi/linux/io_uring.h>
-enum io_uring_cmd_flags {
- IO_URING_F_COMPLETE_DEFER = 1,
- IO_URING_F_UNLOCKED = 2,
- /* the request is executed from poll, it should not be freed */
- IO_URING_F_MULTISHOT = 4,
- /* executed by io-wq */
- IO_URING_F_IOWQ = 8,
- /* int's last bit, sign checks are usually faster than a bit test */
- IO_URING_F_NONBLOCK = INT_MIN,
-
- /* ctx state flags, for URING_CMD */
- IO_URING_F_SQE128 = (1 << 8),
- IO_URING_F_CQE32 = (1 << 9),
- IO_URING_F_IOPOLL = (1 << 10),
-
- /* set when uring wants to cancel a previously issued command */
- IO_URING_F_CANCEL = (1 << 11),
- IO_URING_F_COMPAT = (1 << 12),
-};
-
-/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
-#define IORING_URING_CMD_CANCELABLE (1U << 30)
-
-struct io_uring_cmd {
- struct file *file;
- const struct io_uring_sqe *sqe;
- /* callback to defer completions to task context */
- void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned);
- u32 cmd_op;
- u32 flags;
- u8 pdu[32]; /* available inline for free use */
-};
-
-static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
-{
- return sqe->cmd;
-}
-
#if defined(CONFIG_IO_URING)
-int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
- struct iov_iter *iter, void *ioucmd);
-void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2,
- unsigned issue_flags);
-struct sock *io_uring_get_socket(struct file *file);
void __io_uring_cancel(bool cancel_all);
void __io_uring_free(struct task_struct *tsk);
void io_uring_unreg_ringfd(void);
const char *io_uring_get_opcode(u8 opcode);
-void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
- void (*task_work_cb)(struct io_uring_cmd *, unsigned),
- unsigned flags);
-/* users should follow semantics of IOU_F_TWQ_LAZY_WAKE */
-void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
- void (*task_work_cb)(struct io_uring_cmd *, unsigned));
-
-static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
- void (*task_work_cb)(struct io_uring_cmd *, unsigned))
-{
- __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0);
-}
+int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
+bool io_is_uring_fops(struct file *file);
static inline void io_uring_files_cancel(void)
{
@@ -84,32 +31,7 @@ static inline void io_uring_free(struct task_struct *tsk)
if (tsk->io_uring)
__io_uring_free(tsk);
}
-int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
-void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
- unsigned int issue_flags);
-struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd);
#else
-static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
- struct iov_iter *iter, void *ioucmd)
-{
- return -EOPNOTSUPP;
-}
-static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
- ssize_t ret2, unsigned issue_flags)
-{
-}
-static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
- void (*task_work_cb)(struct io_uring_cmd *, unsigned))
-{
-}
-static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
- void (*task_work_cb)(struct io_uring_cmd *, unsigned))
-{
-}
-static inline struct sock *io_uring_get_socket(struct file *file)
-{
- return NULL;
-}
static inline void io_uring_task_cancel(void)
{
}
@@ -128,13 +50,9 @@ static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd,
{
return -EOPNOTSUPP;
}
-static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
- unsigned int issue_flags)
-{
-}
-static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
+static inline bool io_is_uring_fops(struct file *file)
{
- return NULL;
+ return false;
}
#endif
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
new file mode 100644
index 000000000000..e453a997c060
--- /dev/null
+++ b/include/linux/io_uring/cmd.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _LINUX_IO_URING_CMD_H
+#define _LINUX_IO_URING_CMD_H
+
+#include <uapi/linux/io_uring.h>
+#include <linux/io_uring_types.h>
+
+/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
+#define IORING_URING_CMD_CANCELABLE (1U << 30)
+
+struct io_uring_cmd {
+ struct file *file;
+ const struct io_uring_sqe *sqe;
+ /* callback to defer completions to task context */
+ void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned);
+ u32 cmd_op;
+ u32 flags;
+ u8 pdu[32]; /* available inline for free use */
+};
+
+static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
+{
+ return sqe->cmd;
+}
+
+#if defined(CONFIG_IO_URING)
+int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
+ struct iov_iter *iter, void *ioucmd);
+void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2,
+ unsigned issue_flags);
+void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
+ void (*task_work_cb)(struct io_uring_cmd *, unsigned),
+ unsigned flags);
+
+void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
+ unsigned int issue_flags);
+
+#else
+static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
+ struct iov_iter *iter, void *ioucmd)
+{
+ return -EOPNOTSUPP;
+}
+static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
+ ssize_t ret2, unsigned issue_flags)
+{
+}
+static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
+ void (*task_work_cb)(struct io_uring_cmd *, unsigned),
+ unsigned flags)
+{
+}
+static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+}
+#endif
+
+/* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */
+static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
+ void (*task_work_cb)(struct io_uring_cmd *, unsigned))
+{
+ __io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE);
+}
+
+static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
+ void (*task_work_cb)(struct io_uring_cmd *, unsigned))
+{
+ __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0);
+}
+
+static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
+{
+ return cmd_to_io_kiocb(cmd)->task;
+}
+
+#endif /* _LINUX_IO_URING_CMD_H */
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 239a4f68801b..854ad67a5f70 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -7,6 +7,37 @@
#include <linux/llist.h>
#include <uapi/linux/io_uring.h>
+enum {
+ /*
+ * A hint to not wake right away but delay until there are enough of
+ * tw's queued to match the number of CQEs the task is waiting for.
+ *
+ * Must not be used wirh requests generating more than one CQE.
+ * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set.
+ */
+ IOU_F_TWQ_LAZY_WAKE = 1,
+};
+
+enum io_uring_cmd_flags {
+ IO_URING_F_COMPLETE_DEFER = 1,
+ IO_URING_F_UNLOCKED = 2,
+ /* the request is executed from poll, it should not be freed */
+ IO_URING_F_MULTISHOT = 4,
+ /* executed by io-wq */
+ IO_URING_F_IOWQ = 8,
+ /* int's last bit, sign checks are usually faster than a bit test */
+ IO_URING_F_NONBLOCK = INT_MIN,
+
+ /* ctx state flags, for URING_CMD */
+ IO_URING_F_SQE128 = (1 << 8),
+ IO_URING_F_CQE32 = (1 << 9),
+ IO_URING_F_IOPOLL = (1 << 10),
+
+ /* set when uring wants to cancel a previously issued command */
+ IO_URING_F_CANCEL = (1 << 11),
+ IO_URING_F_COMPAT = (1 << 12),
+};
+
struct io_wq_work_node {
struct io_wq_work_node *next;
};
@@ -358,9 +389,6 @@ struct io_ring_ctx {
struct wait_queue_head rsrc_quiesce_wq;
unsigned rsrc_quiesce;
- #if defined(CONFIG_UNIX)
- struct socket *ring_sock;
- #endif
/* hashed buffered write serialization */
struct io_wq_hash *hash_map;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index f1c16f817742..7a673b52827b 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -71,6 +71,7 @@ struct io_uring_sqe {
__u32 uring_cmd_flags;
__u32 waitid_flags;
__u32 futex_flags;
+ __u32 install_fd_flags;
};
__u64 user_data; /* data to be passed back at completion time */
/* pack this to avoid bogus arm OABI complaints */
@@ -253,6 +254,7 @@ enum io_uring_op {
IORING_OP_FUTEX_WAIT,
IORING_OP_FUTEX_WAKE,
IORING_OP_FUTEX_WAITV,
+ IORING_OP_FIXED_FD_INSTALL,
/* this goes last, obviously */
IORING_OP_LAST,
@@ -387,6 +389,13 @@ enum {
#define IORING_MSG_RING_FLAGS_PASS (1U << 1)
/*
+ * IORING_OP_FIXED_FD_INSTALL flags (sqe->install_fd_flags)
+ *
+ * IORING_FIXED_FD_NO_CLOEXEC Don't mark the fd as O_CLOEXEC
+ */
+#define IORING_FIXED_FD_NO_CLOEXEC (1U << 0)
+
+/*
* IO completion data structure (Completion Queue Entry)
*/
struct io_uring_cqe {
@@ -558,6 +567,9 @@ enum {
/* register a range of fixed file slots for automatic slot allocation */
IORING_REGISTER_FILE_ALLOC_RANGE = 25,
+ /* return status information for a buffer group */
+ IORING_REGISTER_PBUF_STATUS = 26,
+
/* this goes last */
IORING_REGISTER_LAST,
@@ -684,6 +696,13 @@ struct io_uring_buf_reg {
__u64 resv[3];
};
+/* argument for IORING_REGISTER_PBUF_STATUS */
+struct io_uring_buf_status {
+ __u32 buf_group; /* input */
+ __u32 head; /* output */
+ __u32 resv[8];
+};
+
/*
* io_uring_restriction->opcode values
*/
diff --git a/io_uring/Makefile b/io_uring/Makefile
index e5be47e4fc3b..2cdc51825405 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -8,6 +8,6 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
statx.o net.o msg_ring.o timeout.o \
sqpoll.o fdinfo.o tctx.o poll.o \
cancel.o kbuf.o rsrc.o rw.o opdef.o \
- notif.o waitid.o
+ notif.o waitid.o register.o
obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FUTEX) += futex.o
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index e7d749991de4..6e86e6188dbe 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -87,13 +87,10 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
io_file_bitmap_clear(&ctx->file_table, slot_index);
}
- ret = io_scm_file_account(ctx, file);
- if (!ret) {
- *io_get_tag_slot(ctx->file_data, slot_index) = 0;
- io_fixed_file_set(file_slot, file);
- io_file_bitmap_set(&ctx->file_table, slot_index);
- }
- return ret;
+ *io_get_tag_slot(ctx->file_data, slot_index) = 0;
+ io_fixed_file_set(file_slot, file);
+ io_file_bitmap_set(&ctx->file_table, slot_index);
+ return 0;
}
int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 9839016f82ea..09b6d860deba 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -60,7 +60,6 @@
#include <linux/net.h>
#include <net/sock.h>
#include <net/af_unix.h>
-#include <net/scm.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
@@ -70,6 +69,7 @@
#include <linux/fadvise.h>
#include <linux/task_work.h>
#include <linux/io_uring.h>
+#include <linux/io_uring/cmd.h>
#include <linux/audit.h>
#include <linux/security.h>
#include <asm/shmparam.h>
@@ -85,6 +85,7 @@
#include "opdef.h"
#include "refs.h"
#include "tctx.h"
+#include "register.h"
#include "sqpoll.h"
#include "fdinfo.h"
#include "kbuf.h"
@@ -103,9 +104,6 @@
#define IORING_MAX_ENTRIES 32768
#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
-#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
- IORING_REGISTER_LAST + IORING_OP_LAST)
-
#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
IOSQE_IO_HARDLINK | IOSQE_ASYNC)
@@ -129,11 +127,6 @@ enum {
IO_CHECK_CQ_DROPPED_BIT,
};
-enum {
- IO_EVENTFD_OP_SIGNAL_BIT,
- IO_EVENTFD_OP_FREE_BIT,
-};
-
struct io_defer_entry {
struct list_head list;
struct io_kiocb *req;
@@ -177,19 +170,6 @@ static struct ctl_table kernel_io_uring_disabled_table[] = {
};
#endif
-struct sock *io_uring_get_socket(struct file *file)
-{
-#if defined(CONFIG_UNIX)
- if (io_is_uring_fops(file)) {
- struct io_ring_ctx *ctx = file->private_data;
-
- return ctx->ring_sock->sk;
- }
-#endif
- return NULL;
-}
-EXPORT_SYMBOL(io_uring_get_socket);
-
static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
{
if (!wq_list_empty(&ctx->submit_state.compl_reqs) ||
@@ -554,8 +534,7 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
}
}
-
-static void io_eventfd_ops(struct rcu_head *rcu)
+void io_eventfd_ops(struct rcu_head *rcu)
{
struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
int ops = atomic_xchg(&ev_fd->ops, 0);
@@ -1898,14 +1877,19 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
io_req_complete_defer(req);
else
io_req_complete_post(req, issue_flags);
- } else if (ret != IOU_ISSUE_SKIP_COMPLETE)
- return ret;
- /* If the op doesn't have a file, we're not polling for it */
- if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
- io_iopoll_req_issued(req, issue_flags);
+ return 0;
+ }
- return 0;
+ if (ret == IOU_ISSUE_SKIP_COMPLETE) {
+ ret = 0;
+ io_arm_ltimeout(req);
+
+ /* If the op doesn't have a file, we're not polling for it */
+ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
+ io_iopoll_req_issued(req, issue_flags);
+ }
+ return ret;
}
int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts)
@@ -2076,9 +2060,7 @@ static inline void io_queue_sqe(struct io_kiocb *req)
* We async punt it if the file wasn't marked NOWAIT, or if the file
* doesn't support non-blocking read/write attempts
*/
- if (likely(!ret))
- io_arm_ltimeout(req);
- else
+ if (unlikely(ret))
io_queue_async(req, ret);
}
@@ -2633,8 +2615,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
__set_current_state(TASK_RUNNING);
atomic_set(&ctx->cq_wait_nr, 0);
- if (ret < 0)
- break;
/*
* Run task_work after scheduling and before io_should_wake().
* If we got woken because of task_work being processed, run it
@@ -2644,6 +2624,18 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
if (!llist_empty(&ctx->work_llist))
io_run_local_work(ctx);
+ /*
+ * Non-local task_work will be run on exit to userspace, but
+ * if we're using DEFER_TASKRUN, then we could have waited
+ * with a timeout for a number of requests. If the timeout
+ * hits, we could have some requests ready to process. Ensure
+ * this break is _after_ we have run task_work, to avoid
+ * deferring running potentially pending requests until the
+ * next time we wait for events.
+ */
+ if (ret < 0)
+ break;
+
check_cq = READ_ONCE(ctx->check_cq);
if (unlikely(check_cq)) {
/* let the caller flush overflows, retry */
@@ -2831,61 +2823,6 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries
return off;
}
-static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
- unsigned int eventfd_async)
-{
- struct io_ev_fd *ev_fd;
- __s32 __user *fds = arg;
- int fd;
-
- ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
- lockdep_is_held(&ctx->uring_lock));
- if (ev_fd)
- return -EBUSY;
-
- if (copy_from_user(&fd, fds, sizeof(*fds)))
- return -EFAULT;
-
- ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
- if (!ev_fd)
- return -ENOMEM;
-
- ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
- if (IS_ERR(ev_fd->cq_ev_fd)) {
- int ret = PTR_ERR(ev_fd->cq_ev_fd);
- kfree(ev_fd);
- return ret;
- }
-
- spin_lock(&ctx->completion_lock);
- ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
- spin_unlock(&ctx->completion_lock);
-
- ev_fd->eventfd_async = eventfd_async;
- ctx->has_evfd = true;
- rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
- atomic_set(&ev_fd->refs, 1);
- atomic_set(&ev_fd->ops, 0);
- return 0;
-}
-
-static int io_eventfd_unregister(struct io_ring_ctx *ctx)
-{
- struct io_ev_fd *ev_fd;
-
- ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
- lockdep_is_held(&ctx->uring_lock));
- if (ev_fd) {
- ctx->has_evfd = false;
- rcu_assign_pointer(ctx->io_ev_fd, NULL);
- if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
- call_rcu(&ev_fd->rcu, io_eventfd_ops);
- return 0;
- }
-
- return -ENXIO;
-}
-
static void io_req_caches_free(struct io_ring_ctx *ctx)
{
struct io_kiocb *req;
@@ -2938,13 +2875,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_rsrc_node_destroy(ctx, ctx->rsrc_node);
WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
-
-#if defined(CONFIG_UNIX)
- if (ctx->ring_sock) {
- ctx->ring_sock->file = NULL; /* so that iput() is called */
- sock_release(ctx->ring_sock);
- }
-#endif
WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free);
@@ -2984,7 +2914,7 @@ static __cold void io_activate_pollwq_cb(struct callback_head *cb)
percpu_ref_put(&ctx->refs);
}
-static __cold void io_activate_pollwq(struct io_ring_ctx *ctx)
+__cold void io_activate_pollwq(struct io_ring_ctx *ctx)
{
spin_lock(&ctx->completion_lock);
/* already activated or in progress */
@@ -3043,19 +2973,6 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
return mask;
}
-static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
-{
- const struct cred *creds;
-
- creds = xa_erase(&ctx->personalities, id);
- if (creds) {
- put_cred(creds);
- return 0;
- }
-
- return -EINVAL;
-}
-
struct io_tctx_exit {
struct callback_head task_work;
struct completion completion;
@@ -3866,32 +3783,12 @@ static int io_uring_install_fd(struct file *file)
/*
* Allocate an anonymous fd, this is what constitutes the application
* visible backing of an io_uring instance. The application mmaps this
- * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
- * we have to tie this fd to a socket for file garbage collection purposes.
+ * fd to gain access to the SQ/CQ ring details.
*/
static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
{
- struct file *file;
-#if defined(CONFIG_UNIX)
- int ret;
-
- ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
- &ctx->ring_sock);
- if (ret)
- return ERR_PTR(ret);
-#endif
-
- file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
+ return anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
O_RDWR | O_CLOEXEC, NULL);
-#if defined(CONFIG_UNIX)
- if (IS_ERR(file)) {
- sock_release(ctx->ring_sock);
- ctx->ring_sock = NULL;
- } else {
- ctx->ring_sock->file = file;
- }
-#endif
- return file;
}
static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
@@ -4158,506 +4055,6 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries,
return io_uring_setup(entries, params);
}
-static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
- unsigned nr_args)
-{
- struct io_uring_probe *p;
- size_t size;
- int i, ret;
-
- size = struct_size(p, ops, nr_args);
- if (size == SIZE_MAX)
- return -EOVERFLOW;
- p = kzalloc(size, GFP_KERNEL);
- if (!p)
- return -ENOMEM;
-
- ret = -EFAULT;
- if (copy_from_user(p, arg, size))
- goto out;
- ret = -EINVAL;
- if (memchr_inv(p, 0, size))
- goto out;
-
- p->last_op = IORING_OP_LAST - 1;
- if (nr_args > IORING_OP_LAST)
- nr_args = IORING_OP_LAST;
-
- for (i = 0; i < nr_args; i++) {
- p->ops[i].op = i;
- if (!io_issue_defs[i].not_supported)
- p->ops[i].flags = IO_URING_OP_SUPPORTED;
- }
- p->ops_len = i;
-
- ret = 0;
- if (copy_to_user(arg, p, size))
- ret = -EFAULT;
-out:
- kfree(p);
- return ret;
-}
-
-static int io_register_personality(struct io_ring_ctx *ctx)
-{
- const struct cred *creds;
- u32 id;
- int ret;
-
- creds = get_current_cred();
-
- ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
- XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
- if (ret < 0) {
- put_cred(creds);
- return ret;
- }
- return id;
-}
-
-static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
- void __user *arg, unsigned int nr_args)
-{
- struct io_uring_restriction *res;
- size_t size;
- int i, ret;
-
- /* Restrictions allowed only if rings started disabled */
- if (!(ctx->flags & IORING_SETUP_R_DISABLED))
- return -EBADFD;
-
- /* We allow only a single restrictions registration */
- if (ctx->restrictions.registered)
- return -EBUSY;
-
- if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
- return -EINVAL;
-
- size = array_size(nr_args, sizeof(*res));
- if (size == SIZE_MAX)
- return -EOVERFLOW;
-
- res = memdup_user(arg, size);
- if (IS_ERR(res))
- return PTR_ERR(res);
-
- ret = 0;
-
- for (i = 0; i < nr_args; i++) {
- switch (res[i].opcode) {
- case IORING_RESTRICTION_REGISTER_OP:
- if (res[i].register_op >= IORING_REGISTER_LAST) {
- ret = -EINVAL;
- goto out;
- }
-
- __set_bit(res[i].register_op,
- ctx->restrictions.register_op);
- break;
- case IORING_RESTRICTION_SQE_OP:
- if (res[i].sqe_op >= IORING_OP_LAST) {
- ret = -EINVAL;
- goto out;
- }
-
- __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
- break;
- case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
- ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
- break;
- case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
- ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
- break;
- default:
- ret = -EINVAL;
- goto out;
- }
- }
-
-out:
- /* Reset all restrictions if an error happened */
- if (ret != 0)
- memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
- else
- ctx->restrictions.registered = true;
-
- kfree(res);
- return ret;
-}
-
-static int io_register_enable_rings(struct io_ring_ctx *ctx)
-{
- if (!(ctx->flags & IORING_SETUP_R_DISABLED))
- return -EBADFD;
-
- if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
- WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
- /*
- * Lazy activation attempts would fail if it was polled before
- * submitter_task is set.
- */
- if (wq_has_sleeper(&ctx->poll_wq))
- io_activate_pollwq(ctx);
- }
-
- if (ctx->restrictions.registered)
- ctx->restricted = 1;
-
- ctx->flags &= ~IORING_SETUP_R_DISABLED;
- if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
- wake_up(&ctx->sq_data->wait);
- return 0;
-}
-
-static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
- cpumask_var_t new_mask)
-{
- int ret;
-
- if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
- ret = io_wq_cpu_affinity(current->io_uring, new_mask);
- } else {
- mutex_unlock(&ctx->uring_lock);
- ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
- mutex_lock(&ctx->uring_lock);
- }
-
- return ret;
-}
-
-static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
- void __user *arg, unsigned len)
-{
- cpumask_var_t new_mask;
- int ret;
-
- if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
- return -ENOMEM;
-
- cpumask_clear(new_mask);
- if (len > cpumask_size())
- len = cpumask_size();
-
- if (in_compat_syscall()) {
- ret = compat_get_bitmap(cpumask_bits(new_mask),
- (const compat_ulong_t __user *)arg,
- len * 8 /* CHAR_BIT */);
- } else {
- ret = copy_from_user(new_mask, arg, len);
- }
-
- if (ret) {
- free_cpumask_var(new_mask);
- return -EFAULT;
- }
-
- ret = __io_register