| author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-01-20 20:27:33 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-01-20 20:27:33 -0800 |
| commit | a312e1706ce6c124f04ec85ddece240f3bb2a696 | |
| tree | ba749bac4f345fc48c7c79d9a9c5713fe87af300 | |
| parent | 1cbfb828e05171ca2dd77b5988d068e6872480fe | |
| parent | 561e3a0c40dc7e3ab7b0b3647a2b89eca16215d9 | |
Merge tag 'for-6.14/io_uring-20250119' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:
"Not a lot in terms of features this time around, mostly just cleanups
and code consolidation:
- Support for PI metadata read/write via io_uring, with NVMe and
SCSI covered (a userspace sketch follows this message)
- Cleanup the per-op structure caching, making it consistent across
various command types
- Consolidate the various user-mapped features into a concept called
regions, making their various users consistent
- Various cleanups and fixes"
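The PI support referenced in the first bullet is exposed as per-SQE attributes. Below is a hedged userspace sketch; the field and constant names (sqe->attr_ptr, sqe->attr_type_mask, struct io_uring_attr_pi, IORING_RW_ATTR_FLAG_PI, and the IO_INTEGRITY_CHK_* flags from <linux/fs.h>) follow the uapi as merged for 6.14, but verify them against your installed headers before relying on the exact layout:

```c
#include <linux/io_uring.h>
#include <string.h>

/*
 * Attach a PI (protection information) attribute to a read/write SQE.
 * Both *pi and the integrity buffer must stay alive until the request
 * completes. Names here are taken from the 6.14 uapi as this editor
 * understands it; treat this as a sketch, not a reference.
 */
static void sqe_attach_pi(struct io_uring_sqe *sqe,
                          struct io_uring_attr_pi *pi,
                          void *pi_buf, __u32 pi_len, __u16 check_flags)
{
        memset(pi, 0, sizeof(*pi));
        pi->flags = check_flags;        /* e.g. IO_INTEGRITY_CHK_GUARD */
        pi->addr = (__u64)(unsigned long)pi_buf;        /* integrity tuples */
        pi->len = pi_len;

        sqe->attr_ptr = (__u64)(unsigned long)pi;
        sqe->attr_type_mask = IORING_RW_ATTR_FLAG_PI;
}
```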
* tag 'for-6.14/io_uring-20250119' of git://git.kernel.dk/linux: (56 commits)
io_uring/fdinfo: fix io_uring_show_fdinfo() misuse of ->d_iname
io_uring: reuse io_should_terminate_tw() for cmds
io_uring: Factor out a function to parse restrictions
io_uring/rsrc: require cloned buffers to share accounting contexts
io_uring: simplify the SQPOLL thread check when cancelling requests
io_uring: expose read/write attribute capability
io_uring/rw: don't gate retry on completion context
io_uring/rw: handle -EAGAIN retry at IO completion time
io_uring/rw: use io_rw_recycle() from cleanup path
io_uring/rsrc: simplify the bvec iter count calculation
io_uring: ensure io_queue_deferred() is out-of-line
io_uring/rw: always clear ->bytes_done on io_async_rw setup
io_uring/rw: use NULL for rw->free_iovec assignment
io_uring/rw: don't mask in f_iocb_flags
io_uring/msg_ring: Drop custom destructor
io_uring: Move old async data allocation helper to header
io_uring/rw: Allocate async data through helper
io_uring/net: Allocate msghdr async data through helper
io_uring/uring_cmd: Allocate async data through generic helper
io_uring/poll: Allocate apoll with generic alloc_cache helper
...
Diffstat (limited to 'io_uring/register.c')
| -rw-r--r-- | io_uring/register.c | 163 |
1 file changed, 80 insertions(+), 83 deletions(-)
```diff
diff --git a/io_uring/register.c b/io_uring/register.c
index 371aec87e078..05025047d1da 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -104,21 +104,13 @@ static int io_register_personality(struct io_ring_ctx *ctx)
 	return id;
 }
 
-static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
-					   void __user *arg, unsigned int nr_args)
+static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
+					struct io_restriction *restrictions)
 {
 	struct io_uring_restriction *res;
 	size_t size;
 	int i, ret;
 
-	/* Restrictions allowed only if rings started disabled */
-	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
-		return -EBADFD;
-
-	/* We allow only a single restrictions registration */
-	if (ctx->restrictions.registered)
-		return -EBUSY;
-
 	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
 		return -EINVAL;
 
@@ -130,47 +122,57 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
 	if (IS_ERR(res))
 		return PTR_ERR(res);
 
-	ret = 0;
+	ret = -EINVAL;
 
 	for (i = 0; i < nr_args; i++) {
 		switch (res[i].opcode) {
 		case IORING_RESTRICTION_REGISTER_OP:
-			if (res[i].register_op >= IORING_REGISTER_LAST) {
-				ret = -EINVAL;
-				goto out;
-			}
-
-			__set_bit(res[i].register_op,
-				  ctx->restrictions.register_op);
+			if (res[i].register_op >= IORING_REGISTER_LAST)
+				goto err;
+			__set_bit(res[i].register_op, restrictions->register_op);
 			break;
 		case IORING_RESTRICTION_SQE_OP:
-			if (res[i].sqe_op >= IORING_OP_LAST) {
-				ret = -EINVAL;
-				goto out;
-			}
-
-			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
+			if (res[i].sqe_op >= IORING_OP_LAST)
+				goto err;
+			__set_bit(res[i].sqe_op, restrictions->sqe_op);
 			break;
 		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
-			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
+			restrictions->sqe_flags_allowed = res[i].sqe_flags;
 			break;
 		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
-			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
+			restrictions->sqe_flags_required = res[i].sqe_flags;
 			break;
 		default:
-			ret = -EINVAL;
-			goto out;
+			goto err;
 		}
 	}
 
-out:
+	ret = 0;
+
+err:
+	kfree(res);
+	return ret;
+}
+
+static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
+					   void __user *arg, unsigned int nr_args)
+{
+	int ret;
+
+	/* Restrictions allowed only if rings started disabled */
+	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
+		return -EBADFD;
+
+	/* We allow only a single restrictions registration */
+	if (ctx->restrictions.registered)
+		return -EBUSY;
+
+	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
 	/* Reset all restrictions if an error happened */
 	if (ret != 0)
 		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
 	else
 		ctx->restrictions.registered = true;
-
-	kfree(res);
 	return ret;
 }
 
@@ -367,28 +369,19 @@ static int io_register_clock(struct io_ring_ctx *ctx,
  * either mapping or freeing.
  */
 struct io_ring_ctx_rings {
-	unsigned short n_ring_pages;
-	unsigned short n_sqe_pages;
-	struct page **ring_pages;
-	struct page **sqe_pages;
-	struct io_uring_sqe *sq_sqes;
 	struct io_rings *rings;
+	struct io_uring_sqe *sq_sqes;
+
+	struct io_mapped_region sq_region;
+	struct io_mapped_region ring_region;
 };
 
-static void io_register_free_rings(struct io_uring_params *p,
+static void io_register_free_rings(struct io_ring_ctx *ctx,
+				   struct io_uring_params *p,
 				   struct io_ring_ctx_rings *r)
 {
-	if (!(p->flags & IORING_SETUP_NO_MMAP)) {
-		io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages,
-				true);
-		io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages,
-				true);
-	} else {
-		io_pages_free(&r->ring_pages, r->n_ring_pages);
-		io_pages_free(&r->sqe_pages, r->n_sqe_pages);
-		vunmap(r->rings);
-		vunmap(r->sq_sqes);
-	}
+	io_free_region(ctx, &r->sq_region);
+	io_free_region(ctx, &r->ring_region);
 }
 
 #define swap_old(ctx, o, n, field)		\
@@ -403,11 +396,11 @@ static void io_register_free_rings(struct io_uring_params *p,
 
 static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 {
+	struct io_uring_region_desc rd;
 	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
 	size_t size, sq_array_offset;
 	unsigned i, tail, old_head;
 	struct io_uring_params p;
-	void *ptr;
 	int ret;
 
 	/* for single issuer, must be owner resizing */
@@ -441,13 +434,18 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 	if (size == SIZE_MAX)
 		return -EOVERFLOW;
 
-	if (!(p.flags & IORING_SETUP_NO_MMAP))
-		n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size);
-	else
-		n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages,
-						p.cq_off.user_addr, size);
-	if (IS_ERR(n.rings))
-		return PTR_ERR(n.rings);
+	memset(&rd, 0, sizeof(rd));
+	rd.size = PAGE_ALIGN(size);
+	if (p.flags & IORING_SETUP_NO_MMAP) {
+		rd.user_addr = p.cq_off.user_addr;
+		rd.flags |= IORING_MEM_REGION_TYPE_USER;
+	}
+	ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
+	if (ret) {
+		io_register_free_rings(ctx, &p, &n);
+		return ret;
+	}
+	n.rings = io_region_get_ptr(&n.ring_region);
 
 	/*
 	 * At this point n.rings is shared with userspace, just like o.rings
@@ -463,7 +461,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 	WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);
 
 	if (copy_to_user(arg, &p, sizeof(p))) {
-		io_register_free_rings(&p, &n);
+		io_register_free_rings(ctx, &p, &n);
 		return -EFAULT;
 	}
 
@@ -472,20 +470,22 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 	else
 		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
 	if (size == SIZE_MAX) {
-		io_register_free_rings(&p, &n);
+		io_register_free_rings(ctx, &p, &n);
 		return -EOVERFLOW;
 	}
 
-	if (!(p.flags & IORING_SETUP_NO_MMAP))
-		ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size);
-	else
-		ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages,
-					p.sq_off.user_addr,
-					size);
-	if (IS_ERR(ptr)) {
-		io_register_free_rings(&p, &n);
-		return PTR_ERR(ptr);
+	memset(&rd, 0, sizeof(rd));
+	rd.size = PAGE_ALIGN(size);
+	if (p.flags & IORING_SETUP_NO_MMAP) {
+		rd.user_addr = p.sq_off.user_addr;
+		rd.flags |= IORING_MEM_REGION_TYPE_USER;
+	}
+	ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
+	if (ret) {
+		io_register_free_rings(ctx, &p, &n);
+		return ret;
 	}
+	n.sq_sqes = io_region_get_ptr(&n.sq_region);
 
 	/*
 	 * If using SQPOLL, park the thread
@@ -497,15 +497,15 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 	}
 
 	/*
-	 * We'll do the swap. Grab the ctx->resize_lock, which will exclude
+	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
 	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
 	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
 	 * existing rings beyond this point will fail. Not that it could proceed
 	 * at this point anyway, as the io_uring mmap side needs go grab the
-	 * ctx->resize_lock as well. Likewise, hold the completion lock over the
+	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
 	 * duration of the actual swap.
 	 */
-	mutex_lock(&ctx->resize_lock);
+	mutex_lock(&ctx->mmap_lock);
 	spin_lock(&ctx->completion_lock);
 	o.rings = ctx->rings;
 	ctx->rings = NULL;
@@ -516,7 +516,6 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 	 * Now copy SQ and CQ entries, if any. If either of the destination
 	 * rings can't hold what is already there, then fail the operation.
 	 */
-	n.sq_sqes = ptr;
 	tail = READ_ONCE(o.rings->sq.tail);
 	old_head = READ_ONCE(o.rings->sq.head);
 	if (tail - old_head > p.sq_entries)
@@ -527,8 +526,8 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
 	}
 
-	WRITE_ONCE(n.rings->sq.head, READ_ONCE(o.rings->sq.head));
-	WRITE_ONCE(n.rings->sq.tail, READ_ONCE(o.rings->sq.tail));
+	WRITE_ONCE(n.rings->sq.head, old_head);
+	WRITE_ONCE(n.rings->sq.tail, tail);
 
 	tail = READ_ONCE(o.rings->cq.tail);
 	old_head = READ_ONCE(o.rings->cq.head);
@@ -547,8 +546,8 @@ overflow:
 		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
 	}
 
-	WRITE_ONCE(n.rings->cq.head, READ_ONCE(o.rings->cq.head));
-	WRITE_ONCE(n.rings->cq.tail, READ_ONCE(o.rings->cq.tail));
+	WRITE_ONCE(n.rings->cq.head, old_head);
+	WRITE_ONCE(n.rings->cq.tail, tail);
 	/* invalidate cached cqe refill */
 	ctx->cqe_cached = ctx->cqe_sentinel = NULL;
 
@@ -566,16 +565,14 @@ overflow:
 	ctx->rings = n.rings;
 	ctx->sq_sqes = n.sq_sqes;
 
-	swap_old(ctx, o, n, n_ring_pages);
-	swap_old(ctx, o, n, n_sqe_pages);
-	swap_old(ctx, o, n, ring_pages);
-	swap_old(ctx, o, n, sqe_pages);
+	swap_old(ctx, o, n, ring_region);
+	swap_old(ctx, o, n, sq_region);
 	to_free = &o;
 	ret = 0;
 out:
 	spin_unlock(&ctx->completion_lock);
-	mutex_unlock(&ctx->resize_lock);
-	io_register_free_rings(&p, to_free);
+	mutex_unlock(&ctx->mmap_lock);
+	io_register_free_rings(ctx, &p, to_free);
 
 	if (ctx->sq_data)
 		io_sq_thread_unpark(ctx->sq_data);
@@ -598,7 +595,6 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
 	rd_uptr = u64_to_user_ptr(reg.region_uptr);
 	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
 		return -EFAULT;
-
 	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
 		return -EINVAL;
 	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
@@ -613,7 +609,8 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
 	    !(ctx->flags & IORING_SETUP_R_DISABLED))
 		return -EINVAL;
 
-	ret = io_create_region(ctx, &ctx->param_region, &rd);
+	ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd,
+					 IORING_MAP_OFF_PARAM_REGION);
 	if (ret)
 		return ret;
 	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
```
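For orientation: the first two hunks split the parsing loop out of io_register_restrictions() into io_parse_restrictions(), while the registration-time checks (ring still disabled, only one registration) stay behind. The userspace side of this interface is unchanged; a minimal sketch using the long-standing IORING_REGISTER_RESTRICTIONS and IORING_REGISTER_ENABLE_RINGS opcodes via the raw syscall (ring_fd is a hypothetical ring created with IORING_SETUP_R_DISABLED):

```c
#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Restrict a disabled ring to a small allowlist, then enable it.
 * Restrictions must be registered before the ring is enabled, and only
 * once -- exactly the -EBADFD/-EBUSY checks that remain in
 * io_register_restrictions() after the refactor above.
 */
static int restrict_and_enable(int ring_fd)
{
        struct io_uring_restriction res[2];

        memset(res, 0, sizeof(res));
        res[0].opcode = IORING_RESTRICTION_SQE_OP;      /* allow only reads */
        res[0].sqe_op = IORING_OP_READ;
        res[1].opcode = IORING_RESTRICTION_REGISTER_OP; /* allow only buffer registration */
        res[1].register_op = IORING_REGISTER_BUFFERS;

        if (syscall(__NR_io_uring_register, ring_fd,
                    IORING_REGISTER_RESTRICTIONS, res, 2) < 0)
                return -1;

        /* Restrictions take effect once the ring is enabled. */
        return syscall(__NR_io_uring_register, ring_fd,
                       IORING_REGISTER_ENABLE_RINGS, NULL, 0);
}
```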
