author     Linus Torvalds <torvalds@linux-foundation.org>   2024-01-11 14:19:23 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>   2024-01-11 14:19:23 -0800
commit     4c72e2b8c42e57f65d8fbfb01329e79d2b450653 (patch)
tree       93dda9fd8adf1c1948c8dd83809aff4f207c7ea5 /io_uring
parent     01d550f0fcc06c7292f79a6f1453aac122d1d2c8 (diff)
parent     6ff1407e24e6fdfa4a16ba9ba551e3d253a26391 (diff)
Merge tag 'for-6.8/io_uring-2024-01-08' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:
 "Mostly just some fixes and cleanups, but one feature as well. In
  detail:

   - Harden the check for handling IOPOLL based on return (Pavel)

   - Various minor optimizations (Pavel)

   - Drop remnants of SCM_RIGHTS fd passing support, now that it's no
     longer supported since 6.7 (me)

   - Fix for a case where bytes_done wasn't initialized properly on a
     failure condition for read/write requests (me)

   - Move the register related code to a separate file (me)

   - Add support for returning the provided ring buffer head (me)

   - Add support for adding a direct descriptor to the normal file
     table (me, Christian Brauner)

   - Fix for ensuring pending task_work for a ring with DEFER_TASKRUN
     is run even if we timeout waiting (me)"

* tag 'for-6.8/io_uring-2024-01-08' of git://git.kernel.dk/linux:
  io_uring: ensure local task_work is run on wait timeout
  io_uring/kbuf: add method for returning provided buffer ring head
  io_uring/rw: ensure io->bytes_done is always initialized
  io_uring: drop any code related to SCM_RIGHTS
  io_uring/unix: drop usage of io_uring socket
  io_uring/register: move io_uring_register(2) related code to register.c
  io_uring/openclose: add support for IORING_OP_FIXED_FD_INSTALL
  io_uring/cmd: inline io_uring_cmd_get_task
  io_uring/cmd: inline io_uring_cmd_do_in_task_lazy
  io_uring: split out cmd api into a separate header
  io_uring: optimise ltimeout for inline execution
  io_uring: don't check iopoll if request completes
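The "direct descriptor to the normal file table" item above is exposed as a new IORING_OP_FIXED_FD_INSTALL opcode; its prep and issue handlers appear in the openclose.c and opdef.c hunks further down. The sketch below shows one way userspace might submit it. It assumes liburing for the ring plumbing and uapi headers carrying the 6.8 additions; the helper name install_fixed_fd() and the exact success value in cqe->res are illustrative assumptions, not taken from this patch.

/*
 * Sketch: turn a fixed (registered) file slot into a regular fd via
 * IORING_OP_FIXED_FD_INSTALL. SQE fields follow what
 * io_install_fixed_fd_prep() reads in the openclose.c hunk below.
 */
#include <errno.h>
#include <liburing.h>
#include <string.h>

static int install_fixed_fd(struct io_uring *ring, int fixed_slot)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EBUSY;

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_FIXED_FD_INSTALL;
	sqe->fd = fixed_slot;		/* index into the fixed file table */
	sqe->flags = IOSQE_FIXED_FILE;	/* prep returns -EBADF without it */
	sqe->install_fd_flags = 0;	/* O_CLOEXEC by default; set
					 * IORING_FIXED_FD_NO_CLOEXEC to opt out */

	ret = io_uring_submit(ring);
	if (ret < 0)
		return ret;
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;			/* expected: the newly installed regular fd */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}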
Diffstat (limited to 'io_uring')
-rw-r--r--  io_uring/Makefile     |   2
-rw-r--r--  io_uring/filetable.c  |  11
-rw-r--r--  io_uring/io_uring.c   | 663
-rw-r--r--  io_uring/io_uring.h   |  19
-rw-r--r--  io_uring/kbuf.c       |  26
-rw-r--r--  io_uring/kbuf.h       |   1
-rw-r--r--  io_uring/opdef.c      |   9
-rw-r--r--  io_uring/openclose.c  |  44
-rw-r--r--  io_uring/openclose.h  |   3
-rw-r--r--  io_uring/register.c   | 605
-rw-r--r--  io_uring/register.h   |   8
-rw-r--r--  io_uring/rsrc.c       | 169
-rw-r--r--  io_uring/rsrc.h       |  15
-rw-r--r--  io_uring/rw.c         |  12
-rw-r--r--  io_uring/uring_cmd.c  |  15
15 files changed, 752 insertions, 850 deletions
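The "returning the provided ring buffer head" item is implemented by io_register_pbuf_status() in the kbuf.c hunk below. A rough sketch of the corresponding io_uring_register(2) call from userspace follows; the IORING_REGISTER_PBUF_STATUS opcode name and the nr_args value of 1 are assumptions based on the rest of this series (the new register.c is not shown in this view), while the resv[] zero requirement mirrors the kbuf.c code.

/*
 * Sketch: query the current head of a provided buffer ring via the raw
 * io_uring_register(2) syscall. struct io_uring_buf_status follows the
 * fields used by io_register_pbuf_status(); the register opcode name and
 * nr_args == 1 are assumptions.
 */
#include <errno.h>
#include <linux/io_uring.h>	/* needs headers with the 6.8 uapi additions */
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int pbuf_ring_head(int ring_fd, unsigned int buf_group, unsigned int *head)
{
	struct io_uring_buf_status st;

	memset(&st, 0, sizeof(st));	/* resv[] must be zero, see kbuf.c below */
	st.buf_group = buf_group;

	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_PBUF_STATUS, &st, 1) < 0)
		return -errno;

	*head = st.head;		/* kernel-side head for this buffer group */
	return 0;
}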
diff --git a/io_uring/Makefile b/io_uring/Makefile
index e5be47e4fc3b..2cdc51825405 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -8,6 +8,6 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
statx.o net.o msg_ring.o timeout.o \
sqpoll.o fdinfo.o tctx.o poll.o \
cancel.o kbuf.o rsrc.o rw.o opdef.o \
- notif.o waitid.o
+ notif.o waitid.o register.o
obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FUTEX) += futex.o
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index e7d749991de4..6e86e6188dbe 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -87,13 +87,10 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
io_file_bitmap_clear(&ctx->file_table, slot_index);
}
- ret = io_scm_file_account(ctx, file);
- if (!ret) {
- *io_get_tag_slot(ctx->file_data, slot_index) = 0;
- io_fixed_file_set(file_slot, file);
- io_file_bitmap_set(&ctx->file_table, slot_index);
- }
- return ret;
+ *io_get_tag_slot(ctx->file_data, slot_index) = 0;
+ io_fixed_file_set(file_slot, file);
+ io_file_bitmap_set(&ctx->file_table, slot_index);
+ return 0;
}
int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 9839016f82ea..09b6d860deba 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -60,7 +60,6 @@
#include <linux/net.h>
#include <net/sock.h>
#include <net/af_unix.h>
-#include <net/scm.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
@@ -70,6 +69,7 @@
#include <linux/fadvise.h>
#include <linux/task_work.h>
#include <linux/io_uring.h>
+#include <linux/io_uring/cmd.h>
#include <linux/audit.h>
#include <linux/security.h>
#include <asm/shmparam.h>
@@ -85,6 +85,7 @@
#include "opdef.h"
#include "refs.h"
#include "tctx.h"
+#include "register.h"
#include "sqpoll.h"
#include "fdinfo.h"
#include "kbuf.h"
@@ -103,9 +104,6 @@
#define IORING_MAX_ENTRIES 32768
#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
-#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
- IORING_REGISTER_LAST + IORING_OP_LAST)
-
#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
IOSQE_IO_HARDLINK | IOSQE_ASYNC)
@@ -129,11 +127,6 @@ enum {
IO_CHECK_CQ_DROPPED_BIT,
};
-enum {
- IO_EVENTFD_OP_SIGNAL_BIT,
- IO_EVENTFD_OP_FREE_BIT,
-};
-
struct io_defer_entry {
struct list_head list;
struct io_kiocb *req;
@@ -177,19 +170,6 @@ static struct ctl_table kernel_io_uring_disabled_table[] = {
};
#endif
-struct sock *io_uring_get_socket(struct file *file)
-{
-#if defined(CONFIG_UNIX)
- if (io_is_uring_fops(file)) {
- struct io_ring_ctx *ctx = file->private_data;
-
- return ctx->ring_sock->sk;
- }
-#endif
- return NULL;
-}
-EXPORT_SYMBOL(io_uring_get_socket);
-
static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
{
if (!wq_list_empty(&ctx->submit_state.compl_reqs) ||
@@ -554,8 +534,7 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
}
}
-
-static void io_eventfd_ops(struct rcu_head *rcu)
+void io_eventfd_ops(struct rcu_head *rcu)
{
struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
int ops = atomic_xchg(&ev_fd->ops, 0);
@@ -1898,14 +1877,19 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
io_req_complete_defer(req);
else
io_req_complete_post(req, issue_flags);
- } else if (ret != IOU_ISSUE_SKIP_COMPLETE)
- return ret;
- /* If the op doesn't have a file, we're not polling for it */
- if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
- io_iopoll_req_issued(req, issue_flags);
+ return 0;
+ }
- return 0;
+ if (ret == IOU_ISSUE_SKIP_COMPLETE) {
+ ret = 0;
+ io_arm_ltimeout(req);
+
+ /* If the op doesn't have a file, we're not polling for it */
+ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
+ io_iopoll_req_issued(req, issue_flags);
+ }
+ return ret;
}
int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts)
@@ -2076,9 +2060,7 @@ static inline void io_queue_sqe(struct io_kiocb *req)
* We async punt it if the file wasn't marked NOWAIT, or if the file
* doesn't support non-blocking read/write attempts
*/
- if (likely(!ret))
- io_arm_ltimeout(req);
- else
+ if (unlikely(ret))
io_queue_async(req, ret);
}
@@ -2633,8 +2615,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
__set_current_state(TASK_RUNNING);
atomic_set(&ctx->cq_wait_nr, 0);
- if (ret < 0)
- break;
/*
* Run task_work after scheduling and before io_should_wake().
* If we got woken because of task_work being processed, run it
@@ -2644,6 +2624,18 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
if (!llist_empty(&ctx->work_llist))
io_run_local_work(ctx);
+ /*
+ * Non-local task_work will be run on exit to userspace, but
+ * if we're using DEFER_TASKRUN, then we could have waited
+ * with a timeout for a number of requests. If the timeout
+ * hits, we could have some requests ready to process. Ensure
+ * this break is _after_ we have run task_work, to avoid
+ * deferring running potentially pending requests until the
+ * next time we wait for events.
+ */
+ if (ret < 0)
+ break;
+
check_cq = READ_ONCE(ctx->check_cq);
if (unlikely(check_cq)) {
/* let the caller flush overflows, retry */
@@ -2831,61 +2823,6 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries
return off;
}
-static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
- unsigned int eventfd_async)
-{
- struct io_ev_fd *ev_fd;
- __s32 __user *fds = arg;
- int fd;
-
- ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
- lockdep_is_held(&ctx->uring_lock));
- if (ev_fd)
- return -EBUSY;
-
- if (copy_from_user(&fd, fds, sizeof(*fds)))
- return -EFAULT;
-
- ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
- if (!ev_fd)
- return -ENOMEM;
-
- ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
- if (IS_ERR(ev_fd->cq_ev_fd)) {
- int ret = PTR_ERR(ev_fd->cq_ev_fd);
- kfree(ev_fd);
- return ret;
- }
-
- spin_lock(&ctx->completion_lock);
- ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
- spin_unlock(&ctx->completion_lock);
-
- ev_fd->eventfd_async = eventfd_async;
- ctx->has_evfd = true;
- rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
- atomic_set(&ev_fd->refs, 1);
- atomic_set(&ev_fd->ops, 0);
- return 0;
-}
-
-static int io_eventfd_unregister(struct io_ring_ctx *ctx)
-{
- struct io_ev_fd *ev_fd;
-
- ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
- lockdep_is_held(&ctx->uring_lock));
- if (ev_fd) {
- ctx->has_evfd = false;
- rcu_assign_pointer(ctx->io_ev_fd, NULL);
- if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
- call_rcu(&ev_fd->rcu, io_eventfd_ops);
- return 0;
- }
-
- return -ENXIO;
-}
-
static void io_req_caches_free(struct io_ring_ctx *ctx)
{
struct io_kiocb *req;
@@ -2938,13 +2875,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_rsrc_node_destroy(ctx, ctx->rsrc_node);
WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
-
-#if defined(CONFIG_UNIX)
- if (ctx->ring_sock) {
- ctx->ring_sock->file = NULL; /* so that iput() is called */
- sock_release(ctx->ring_sock);
- }
-#endif
WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free);
@@ -2984,7 +2914,7 @@ static __cold void io_activate_pollwq_cb(struct callback_head *cb)
percpu_ref_put(&ctx->refs);
}
-static __cold void io_activate_pollwq(struct io_ring_ctx *ctx)
+__cold void io_activate_pollwq(struct io_ring_ctx *ctx)
{
spin_lock(&ctx->completion_lock);
/* already activated or in progress */
@@ -3043,19 +2973,6 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
return mask;
}
-static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
-{
- const struct cred *creds;
-
- creds = xa_erase(&ctx->personalities, id);
- if (creds) {
- put_cred(creds);
- return 0;
- }
-
- return -EINVAL;
-}
-
struct io_tctx_exit {
struct callback_head task_work;
struct completion completion;
@@ -3866,32 +3783,12 @@ static int io_uring_install_fd(struct file *file)
/*
* Allocate an anonymous fd, this is what constitutes the application
* visible backing of an io_uring instance. The application mmaps this
- * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
- * we have to tie this fd to a socket for file garbage collection purposes.
+ * fd to gain access to the SQ/CQ ring details.
*/
static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
{
- struct file *file;
-#if defined(CONFIG_UNIX)
- int ret;
-
- ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
- &ctx->ring_sock);
- if (ret)
- return ERR_PTR(ret);
-#endif
-
- file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
+ return anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
O_RDWR | O_CLOEXEC, NULL);
-#if defined(CONFIG_UNIX)
- if (IS_ERR(file)) {
- sock_release(ctx->ring_sock);
- ctx->ring_sock = NULL;
- } else {
- ctx->ring_sock->file = file;
- }
-#endif
- return file;
}
static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
@@ -4158,506 +4055,6 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries,
return io_uring_setup(entries, params);
}
-static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
- unsigned nr_args)
-{
- struct io_uring_probe *p;
- size_t size;
- int i, ret;
-
- size = struct_size(p, ops, nr_args);
- if (size == SIZE_MAX)
- return -EOVERFLOW;
- p = kzalloc(size, GFP_KERNEL);
- if (!p)
- return -ENOMEM;
-
- ret = -EFAULT;
- if (copy_from_user(p, arg, size))
- goto out;
- ret = -EINVAL;
- if (memchr_inv(p, 0, size))
- goto out;
-
- p->last_op = IORING_OP_LAST - 1;
- if (nr_args > IORING_OP_LAST)
- nr_args = IORING_OP_LAST;
-
- for (i = 0; i < nr_args; i++) {
- p->ops[i].op = i;
- if (!io_issue_defs[i].not_supported)
- p->ops[i].flags = IO_URING_OP_SUPPORTED;
- }
- p->ops_len = i;
-
- ret = 0;
- if (copy_to_user(arg, p, size))
- ret = -EFAULT;
-out:
- kfree(p);
- return ret;
-}
-
-static int io_register_personality(struct io_ring_ctx *ctx)
-{
- const struct cred *creds;
- u32 id;
- int ret;
-
- creds = get_current_cred();
-
- ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
- XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
- if (ret < 0) {
- put_cred(creds);
- return ret;
- }
- return id;
-}
-
-static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
- void __user *arg, unsigned int nr_args)
-{
- struct io_uring_restriction *res;
- size_t size;
- int i, ret;
-
- /* Restrictions allowed only if rings started disabled */
- if (!(ctx->flags & IORING_SETUP_R_DISABLED))
- return -EBADFD;
-
- /* We allow only a single restrictions registration */
- if (ctx->restrictions.registered)
- return -EBUSY;
-
- if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
- return -EINVAL;
-
- size = array_size(nr_args, sizeof(*res));
- if (size == SIZE_MAX)
- return -EOVERFLOW;
-
- res = memdup_user(arg, size);
- if (IS_ERR(res))
- return PTR_ERR(res);
-
- ret = 0;
-
- for (i = 0; i < nr_args; i++) {
- switch (res[i].opcode) {
- case IORING_RESTRICTION_REGISTER_OP:
- if (res[i].register_op >= IORING_REGISTER_LAST) {
- ret = -EINVAL;
- goto out;
- }
-
- __set_bit(res[i].register_op,
- ctx->restrictions.register_op);
- break;
- case IORING_RESTRICTION_SQE_OP:
- if (res[i].sqe_op >= IORING_OP_LAST) {
- ret = -EINVAL;
- goto out;
- }
-
- __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
- break;
- case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
- ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
- break;
- case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
- ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
- break;
- default:
- ret = -EINVAL;
- goto out;
- }
- }
-
-out:
- /* Reset all restrictions if an error happened */
- if (ret != 0)
- memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
- else
- ctx->restrictions.registered = true;
-
- kfree(res);
- return ret;
-}
-
-static int io_register_enable_rings(struct io_ring_ctx *ctx)
-{
- if (!(ctx->flags & IORING_SETUP_R_DISABLED))
- return -EBADFD;
-
- if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
- WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
- /*
- * Lazy activation attempts would fail if it was polled before
- * submitter_task is set.
- */
- if (wq_has_sleeper(&ctx->poll_wq))
- io_activate_pollwq(ctx);
- }
-
- if (ctx->restrictions.registered)
- ctx->restricted = 1;
-
- ctx->flags &= ~IORING_SETUP_R_DISABLED;
- if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
- wake_up(&ctx->sq_data->wait);
- return 0;
-}
-
-static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
- cpumask_var_t new_mask)
-{
- int ret;
-
- if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
- ret = io_wq_cpu_affinity(current->io_uring, new_mask);
- } else {
- mutex_unlock(&ctx->uring_lock);
- ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
- mutex_lock(&ctx->uring_lock);
- }
-
- return ret;
-}
-
-static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
- void __user *arg, unsigned len)
-{
- cpumask_var_t new_mask;
- int ret;
-
- if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
- return -ENOMEM;
-
- cpumask_clear(new_mask);
- if (len > cpumask_size())
- len = cpumask_size();
-
- if (in_compat_syscall()) {
- ret = compat_get_bitmap(cpumask_bits(new_mask),
- (const compat_ulong_t __user *)arg,
- len * 8 /* CHAR_BIT */);
- } else {
- ret = copy_from_user(new_mask, arg, len);
- }
-
- if (ret) {
- free_cpumask_var(new_mask);
- return -EFAULT;
- }
-
- ret = __io_register_iowq_aff(ctx, new_mask);
- free_cpumask_var(new_mask);
- return ret;
-}
-
-static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
-{
- return __io_register_iowq_aff(ctx, NULL);
-}
-
-static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
- void __user *arg)
- __must_hold(&ctx->uring_lock)
-{
- struct io_tctx_node *node;
- struct io_uring_task *tctx = NULL;
- struct io_sq_data *sqd = NULL;
- __u32 new_count[2];
- int i, ret;
-
- if (copy_from_user(new_count, arg, sizeof(new_count)))
- return -EFAULT;
- for (i = 0; i < ARRAY_SIZE(new_count); i++)
- if (new_count[i] > INT_MAX)
- return -EINVAL;
-
- if (ctx->flags & IORING_SETUP_SQPOLL) {
- sqd = ctx->sq_data;
- if (sqd) {
- /*
- * Observe the correct sqd->lock -> ctx->uring_lock
- * ordering. Fine to drop uring_lock here, we hold
- * a ref to the ctx.
- */
- refcount_inc(&sqd->refs);
- mutex_unlock(&ctx->uring_lock);
- mutex_lock(&sqd->lock);
- mutex_lock(&ctx->uring_lock);
- if (sqd->thread)
- tctx = sqd->thread->io_uring;
- }
- } else {
- tctx = current->io_uring;
- }
-
- BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
-
- for (i = 0; i < ARRAY_SIZE(new_count); i++)
- if (new_count[i])
- ctx->iowq_limits[i] = new_count[i];
- ctx->iowq_limits_set = true;
-
- if (tctx && tctx->io_wq) {
- ret = io_wq_max_workers(tctx->io_wq, new_count);
- if (ret)
- goto err;
- } else {
- memset(new_count, 0, sizeof(new_count));
- }
-
- if (sqd) {
- mutex_unlock(&sqd->lock);
- io_put_sq_data(sqd);
- }
-
- if (copy_to_user(arg, new_count, sizeof(new_count)))
- return -EFAULT;
-
- /* that's it for SQPOLL, only the SQPOLL task creates requests */
- if (sqd)
- return 0;
-
- /* now propagate the restriction to all registered users */
- list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
- struct io_uring_task *tctx = node->task->io_uring;
-
- if (WARN_ON_ONCE(!tctx->io_wq))
- continue;
-
- for (i = 0; i < ARRAY_SIZE(new_count); i++)
- new_count[i] = ctx->iowq_limits[i];
- /* ignore errors, it always returns zero anyway */
- (void)io_wq_max_workers(tctx->io_wq, new_count);
- }
- return 0;
-err:
- if (sqd) {
- mutex_unlock(&sqd->lock);
- io_put_sq_data(sqd);
- }
- return ret;
-}
-
-static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
- void __user *arg, unsigned nr_args)
- __releases(ctx->uring_lock)
- __acquires(ctx->uring_lock)
-{
- int ret;
-
- /*
- * We don't quiesce the refs for register anymore and so it can't be
- * dying as we're holding a file ref here.
- */
- if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
- return -ENXIO;
-
- if (ctx->submitter_task && ctx->submitter_task != current)
- return -EEXIST;
-
- if (ctx->restricted) {
- opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
- if (!test_bit(opcode, ctx->restrictions.register_op))
- return -EACCES;
- }
-
- switch (opcode) {
- case IORING_REGISTER_BUFFERS:
- ret = -EFAULT;
- if (!arg)
- break;
- ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
- break;
- case IORING_UNREGISTER_BUFFERS:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_sqe_buffers_unregister(ctx);
- break;
- case IORING_REGISTER_FILES:
- ret = -EFAULT;
- if (!arg)
- break;
- ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
- break;
- case IORING_UNREGISTER_FILES:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_sqe_files_unregister(ctx);
- break;
- case IORING_REGISTER_FILES_UPDATE:
- ret = io_register_files_update(ctx, arg, nr_args);
- break;
- case IORING_REGISTER_EVENTFD:
- ret = -EINVAL;
- if (nr_args != 1)
- break;
- ret = io_eventfd_register(ctx, arg, 0);
- break;
- case IORING_REGISTER_EVENTFD_ASYNC:
- ret = -EINVAL;
- if (nr_args != 1)
- break;
- ret = io_eventfd_register(ctx, arg, 1);
- break;
- case IORING_UNREGISTER_EVENTFD:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_eventfd_unregister(ctx);
- break;
- case IORING_REGISTER_PROBE:
- ret = -EINVAL;
- if (!arg || nr_args > 256)
- break;
- ret = io_probe(ctx, arg, nr_args);
- break;
- case IORING_REGISTER_PERSONALITY:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_register_personality(ctx);
- break;
- case IORING_UNREGISTER_PERSONALITY:
- ret = -EINVAL;
- if (arg)
- break;
- ret = io_unregister_personality(ctx, nr_args);
- break;
- case IORING_REGISTER_ENABLE_RINGS:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_register_enable_rings(ctx);
- break;
- case IORING_REGISTER_RESTRICTIONS:
- ret = io_register_restrictions(ctx, arg, nr_args);
- break;
- case IORING_REGISTER_FILES2:
- ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
- break;
- case IORING_REGISTER_FILES_UPDATE2:
- ret = io_register_rsrc_update(ctx, arg, nr_args,
- IORING_RSRC_FILE);
- break;
- case IORING_REGISTER_BUFFERS2:
- ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
- break;
- case IORING_REGISTER_BUFFERS_UPDATE:
- ret = io_register_rsrc_update(ctx, arg, nr_args,
- IORING_RSRC_BUFFER);
- break;
- case IORING_REGISTER_IOWQ_AFF:
- ret = -EINVAL;
- if (!arg || !nr_args)
- break;
- ret = io_register_iowq_aff(ctx, arg, nr_args);
- break;
- case IORING_UNREGISTER_IOWQ_AFF:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_unregister_iowq_aff(ctx);
- break;
- case IORING_REGISTER_IOWQ_MAX_WORKERS:
- ret = -EINVAL;
- if (!arg || nr_args != 2)
- break;
- ret = io_register_iowq_max_workers(ctx, arg);
- break;
- case IORING_REGISTER_RING_FDS:
- ret = io_ringfd_register(ctx, arg, nr_args);
- break;
- case IORING_UNREGISTER_RING_FDS:
- ret = io_ringfd_unregister(ctx, arg, nr_args);
- break;
- case IORING_REGISTER_PBUF_RING:
- ret = -EINVAL;
- if (!arg || nr_args != 1)
- break;
- ret = io_register_pbuf_ring(ctx, arg);
- break;
- case IORING_UNREGISTER_PBUF_RING:
- ret = -EINVAL;
- if (!arg || nr_args != 1)
- break;
- ret = io_unregister_pbuf_ring(ctx, arg);
- break;
- case IORING_REGISTER_SYNC_CANCEL:
- ret = -EINVAL;
- if (!arg || nr_args != 1)
- break;
- ret = io_sync_cancel(ctx, arg);
- break;
- case IORING_REGISTER_FILE_ALLOC_RANGE:
- ret = -EINVAL;
- if (!arg || nr_args)
- break;
- ret = io_register_file_alloc_range(ctx, arg);
- break;
- default:
- ret = -EINVAL;
- break;
- }
-
- return ret;
-}
-
-SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
- void __user *, arg, unsigned int, nr_args)
-{
- struct io_ring_ctx *ctx;
- long ret = -EBADF;
- struct file *file;
- bool use_registered_ring;
-
- use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
- opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;
-
- if (opcode >= IORING_REGISTER_LAST)
- return -EINVAL;
-
- if (use_registered_ring) {
- /*
- * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
- * need only dereference our task private array to find it.
- */
- struct io_uring_task *tctx = current->io_uring;
-
- if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
- return -EINVAL;
- fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
- file = tctx->registered_rings[fd];
- if (unlikely(!file))
- return -EBADF;
- } else {
- file = fget(fd);
- if (unlikely(!file))
- return -EBADF;
- ret = -EOPNOTSUPP;
- if (!io_is_uring_fops(file))
- goto out_fput;
- }
-
- ctx = file->private_data;
-
- mutex_lock(&ctx->uring_lock);
- ret = __io_uring_register(ctx, opcode, arg, nr_args);
- mutex_unlock(&ctx->uring_lock);
- trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
-out_fput:
- if (!use_registered_ring)
- fput(file);
- return ret;
-}
-
static int __init io_uring_init(void)
{
#define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index ed84f2737b3a..04e33f25919c 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -15,16 +15,6 @@
#include <trace/events/io_uring.h>
#endif
-enum {
- /*
- * A hint to not wake right away but delay until there are enough of
- * tw's queued to match the number of CQEs the task is waiting for.
- *
- * Must not be used with requests generating more than one CQE.
- * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set.
- */
- IOU_F_TWQ_LAZY_WAKE = 1,
-};
enum {
IOU_OK = 0,
@@ -54,7 +44,6 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
unsigned issue_flags);
void __io_req_task_work_add(struct io_kiocb *req, unsigned flags);
-bool io_is_uring_fops(struct file *file);
bool io_alloc_async_data(struct io_kiocb *req);
void io_req_task_queue(struct io_kiocb *req);
void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use);
@@ -89,6 +78,14 @@ bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
void *io_mem_alloc(size_t size);
void io_mem_free(void *ptr);
+enum {
+ IO_EVENTFD_OP_SIGNAL_BIT,
+ IO_EVENTFD_OP_FREE_BIT,
+};
+
+void io_eventfd_ops(struct rcu_head *rcu);
+void io_activate_pollwq(struct io_ring_ctx *ctx);
+
#if defined(CONFIG_PROVE_LOCKING)
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
{
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 72b6af1d2ed3..18df5a9d2f5e 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -750,6 +750,32 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
return 0;
}
+int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
+{
+ struct io_uring_buf_status buf_status;
+ struct io_buffer_list *bl;
+ int i;
+
+ if (copy_from_user(&buf_status, arg, sizeof(buf_status)))
+ return -EFAULT;
+
+ for (i = 0; i < ARRAY_SIZE(buf_status.resv); i++)
+ if (buf_status.resv[i])
+ return -EINVAL;
+
+ bl = io_buffer_get_list(ctx, buf_status.buf_group);
+ if (!bl)
+ return -ENOENT;
+ if (!bl->is_mapped)
+ return -EINVAL;
+
+ buf_status.head = bl->head;
+ if (copy_to_user(arg, &buf_status, sizeof(buf_status)))
+ return -EFAULT;
+
+ return 0;
+}
+
void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
{
struct io_buffer_list *bl;
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 9be5960817ea..53dfaa71a397 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -53,6 +53,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags);
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
+int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg);
void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx);
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 799db44283c7..6705634e5f52 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -469,6 +469,12 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_eopnotsupp_prep,
#endif
},
+ [IORING_OP_FIXED_FD_INSTALL] = {
+ .needs_file = 1,
+ .audit_skip = 1,
+ .prep = io_install_fixed_fd_prep,
+ .issue = io_install_fixed_fd,
+ },
};
const struct io_cold_def io_cold_defs[] = {
@@ -704,6 +710,9 @@ const struct io_cold_def io_cold_defs[] = {
[IORING_OP_FUTEX_WAITV] = {
.name = "FUTEX_WAITV",
},
+ [IORING_OP_FIXED_FD_INSTALL] = {
+ .name = "FIXED_FD_INSTALL",
+ },
};
const char *io_uring_get_opcode(u8 opcode)
diff --git a/io_uring/openclose.c b/io_uring/openclose.c
index 74fc22461f48..0fe0dd305546 100644
--- a/io_uring/openclose.c
+++ b/io_uring/openclose.c
@@ -31,6 +31,11 @@ struct io_close {
u32 file_slot;
};
+struct io_fixed_install {
+ struct file *file;
+ unsigned int o_flags;
+};
+
static bool io_openat_force_async(struct io_open *open)
{
/*
@@ -254,3 +259,42 @@ err:
io_req_set_res(req, ret, 0);
return IOU_OK;
}
+
+int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_fixed_install *ifi;
+ unsigned int flags;
+
+ if (sqe->off || sqe->addr || sqe->len || sqe->buf_index ||
+ sqe->splice_fd_in || sqe->addr3)
+ return -EINVAL;
+
+ /* must be a fixed file */
+ if (!(req->flags & REQ_F_FIXED_FILE))
+ return -EBADF;
+
+ flags = READ_ONCE(sqe->install_fd_flags);
+ if (flags & ~IORING_FIXED_FD_NO_CLOEXEC)
+ return -EINVAL;
+
+ /* default to O_CLOEXEC, disable if IORING_FIXED_FD_NO_CLOEXEC is set */
+ ifi = io_kiocb_to_cmd(req, struct io_fixed_install);
+ ifi->o_flags = O_CLOEXEC;
+ if (flags & IORING_FIXED_FD_NO_CLOEXEC)
+ ifi->o_flags = 0;
+
+ return 0;