summaryrefslogtreecommitdiff
path: root/io_uring/io_uring.h
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-10-02 09:56:23 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-10-02 09:56:23 -0700
commit5832d26433f2bd0d28f8b12526e3c2fdb203507f (patch)
treec0cdd1df24131bee06e1318cd453e2790fdf654a /io_uring/io_uring.h
parent77633c77eee37ddc160493a4cf6070c166f47dc0 (diff)
parentef9f603fd3d4b7937f2cdbce40e47df0a54b2a55 (diff)
downloadlinux-5832d26433f2bd0d28f8b12526e3c2fdb203507f.tar.gz
linux-5832d26433f2bd0d28f8b12526e3c2fdb203507f.tar.bz2
linux-5832d26433f2bd0d28f8b12526e3c2fdb203507f.zip
Merge tag 'for-6.18/io_uring-20250929' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull io_uring updates from Jens Axboe: - Store ring provided buffers locally for the users, rather than stuff them into struct io_kiocb. These types of buffers must always be fully consumed or recycled in the current context, and leaving them in struct io_kiocb is hence not a good idea as that struct has a vastly different lifetime. Basically just an architecture cleanup that can help prevent issues with ring provided buffers in the future. - Support for mixed CQE sizes in the same ring. Before this change, a CQ ring either used the default 16b CQEs, or it was set up with 32b CQEs using IORING_SETUP_CQE32. For use cases where a few 32b CQEs were needed, this caused everything else to use big CQEs. This is wasteful in terms of both memory usage and memory bandwidth for the posted CQEs. With IORING_SETUP_CQE_MIXED, applications may use request types that post both normal 16b and big 32b CQEs on the same ring. - Add helpers for async data management, to make it harder for opcode handlers to mess it up. - Add support for multishot for uring_cmd, which ublk can use. This helps improve efficiency, by providing a persistent request type that can trigger multiple CQEs. - Add initial support for ring feature querying. We had basic support for probe operations, but the API isn't great. Rather than expand that, add support for QUERY which is easily expandable and can cover a lot more cases than the existing probe support. This will help applications get a better idea of what operations are supported on a given host. - zcrx improvements from Pavel: - Improve refill entry alignment for better caching - Various cleanups, especially around deduplicating normal memory vs dmabuf setup. - Generalisation of the niov size (Patch 12). It's still hard-coded to PAGE_SIZE on init, but will let the user specify the rx buffer length on setup. - Syscall / synchronous buffer return. It'll be used as a slow fallback path for returning buffers when the refill queue is full.
Useful for tolerating slight queue size misconfiguration or inconsistent load. - Accounting more memory to cgroups. - Additional independent cleanups that will also be useful for multi-area support. - Various fixes and cleanups * tag 'for-6.18/io_uring-20250929' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (68 commits) io_uring/cmd: drop unused res2 param from io_uring_cmd_done() io_uring: fix nvme's 32b cqes on mixed cq io_uring/query: cap number of queries io_uring/query: prevent infinite loops io_uring/zcrx: account niov arrays to cgroup io_uring/zcrx: allow synchronous buffer return io_uring/zcrx: introduce io_parse_rqe() io_uring/zcrx: don't adjust free cache space io_uring/zcrx: use guards for the refill lock io_uring/zcrx: reduce netmem scope in refill io_uring/zcrx: protect netdev with pp_lock io_uring/zcrx: rename dma lock io_uring/zcrx: make niov size variable io_uring/zcrx: set sgt for umem area io_uring/zcrx: remove dmabuf_offset io_uring/zcrx: deduplicate area mapping io_uring/zcrx: pass ifq to io_zcrx_alloc_fallback() io_uring/zcrx: check all niovs filled with dma addresses io_uring/zcrx: move area reg checks into io_import_area io_uring/zcrx: don't pass slot to io_zcrx_create_area ...
Diffstat (limited to 'io_uring/io_uring.h')
-rw-r--r--io_uring/io_uring.h120
1 files changed, 105 insertions, 15 deletions
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 1880902be6fd..46d9141d772a 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -11,13 +11,69 @@
#include "alloc_cache.h"
#include "io-wq.h"
#include "slist.h"
-#include "filetable.h"
#include "opdef.h"
#ifndef CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
#endif
+#define IORING_FEAT_FLAGS (IORING_FEAT_SINGLE_MMAP |\
+ IORING_FEAT_NODROP |\
+ IORING_FEAT_SUBMIT_STABLE |\
+ IORING_FEAT_RW_CUR_POS |\
+ IORING_FEAT_CUR_PERSONALITY |\
+ IORING_FEAT_FAST_POLL |\
+ IORING_FEAT_POLL_32BITS |\
+ IORING_FEAT_SQPOLL_NONFIXED |\
+ IORING_FEAT_EXT_ARG |\
+ IORING_FEAT_NATIVE_WORKERS |\
+ IORING_FEAT_RSRC_TAGS |\
+ IORING_FEAT_CQE_SKIP |\
+ IORING_FEAT_LINKED_FILE |\
+ IORING_FEAT_REG_REG_RING |\
+ IORING_FEAT_RECVSEND_BUNDLE |\
+ IORING_FEAT_MIN_TIMEOUT |\
+ IORING_FEAT_RW_ATTR |\
+ IORING_FEAT_NO_IOWAIT)
+
+#define IORING_SETUP_FLAGS (IORING_SETUP_IOPOLL |\
+ IORING_SETUP_SQPOLL |\
+ IORING_SETUP_SQ_AFF |\
+ IORING_SETUP_CQSIZE |\
+ IORING_SETUP_CLAMP |\
+ IORING_SETUP_ATTACH_WQ |\
+ IORING_SETUP_R_DISABLED |\
+ IORING_SETUP_SUBMIT_ALL |\
+ IORING_SETUP_COOP_TASKRUN |\
+ IORING_SETUP_TASKRUN_FLAG |\
+ IORING_SETUP_SQE128 |\
+ IORING_SETUP_CQE32 |\
+ IORING_SETUP_SINGLE_ISSUER |\
+ IORING_SETUP_DEFER_TASKRUN |\
+ IORING_SETUP_NO_MMAP |\
+ IORING_SETUP_REGISTERED_FD_ONLY |\
+ IORING_SETUP_NO_SQARRAY |\
+ IORING_SETUP_HYBRID_IOPOLL |\
+ IORING_SETUP_CQE_MIXED)
+
+#define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\
+ IORING_ENTER_SQ_WAKEUP |\
+ IORING_ENTER_SQ_WAIT |\
+ IORING_ENTER_EXT_ARG |\
+ IORING_ENTER_REGISTERED_RING |\
+ IORING_ENTER_ABS_TIMER |\
+ IORING_ENTER_EXT_ARG_REG |\
+ IORING_ENTER_NO_IOWAIT)
+
+
+#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE |\
+ IOSQE_IO_DRAIN |\
+ IOSQE_IO_LINK |\
+ IOSQE_IO_HARDLINK |\
+ IOSQE_ASYNC |\
+ IOSQE_BUFFER_SELECT |\
+ IOSQE_CQE_SKIP_SUCCESS)
+
enum {
IOU_COMPLETE = 0,
@@ -75,7 +131,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq)
unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
unsigned int cq_entries, size_t *sq_offset);
int io_uring_fill_params(unsigned entries, struct io_uring_params *p);
-bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow);
+bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32);
int io_run_task_work_sig(struct io_ring_ctx *ctx);
void io_req_defer_failed(struct io_kiocb *req, s32 res);
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
@@ -169,25 +225,31 @@ static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx,
struct io_uring_cqe **ret,
- bool overflow)
+ bool overflow, bool cqe32)
{
io_lockdep_assert_cq_locked(ctx);
- if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) {
- if (unlikely(!io_cqe_cache_refill(ctx, overflow)))
+ if (unlikely(ctx->cqe_sentinel - ctx->cqe_cached < (cqe32 + 1))) {
+ if (unlikely(!io_cqe_cache_refill(ctx, overflow, cqe32)))
return false;
}
*ret = ctx->cqe_cached;
ctx->cached_cq_tail++;
ctx->cqe_cached++;
- if (ctx->flags & IORING_SETUP_CQE32)
+ if (ctx->flags & IORING_SETUP_CQE32) {
+ ctx->cqe_cached++;
+ } else if (cqe32 && ctx->flags & IORING_SETUP_CQE_MIXED) {
ctx->cqe_cached++;
+ ctx->cached_cq_tail++;
+ }
+ WARN_ON_ONCE(ctx->cqe_cached > ctx->cqe_sentinel);
return true;
}
-static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret)
+static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret,
+ bool cqe32)
{
- return io_get_cqe_overflow(ctx, ret, false);
+ return io_get_cqe_overflow(ctx, ret, false, cqe32);
}
static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx,
@@ -196,25 +258,24 @@ static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx,
io_lockdep_assert_cq_locked(ctx);
ctx->submit_state.cq_flush = true;
- return io_get_cqe(ctx, cqe_ret);
+ return io_get_cqe(ctx, cqe_ret, ctx->flags & IORING_SETUP_CQE_MIXED);
}
static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
struct io_kiocb *req)
{
+ bool is_cqe32 = req->cqe.flags & IORING_CQE_F_32;
struct io_uring_cqe *cqe;
/*
- * If we can't get a cq entry, userspace overflowed the
- * submission (by quite a lot). Increment the overflow count in
- * the ring.
+ * If we can't get a cq entry, userspace overflowed the submission
+ * (by quite a lot).
*/
- if (unlikely(!io_get_cqe(ctx, &cqe)))
+ if (unlikely(!io_get_cqe(ctx, &cqe, is_cqe32)))
return false;
-
memcpy(cqe, &req->cqe, sizeof(*cqe));
- if (ctx->flags & IORING_SETUP_CQE32) {
+ if (ctx->flags & IORING_SETUP_CQE32 || is_cqe32) {
memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe));
memset(&req->big_cqe, 0, sizeof(req->big_cqe));
}
@@ -239,6 +300,22 @@ static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags)
req->cqe.flags = cflags;
}
+static inline u32 ctx_cqe32_flags(struct io_ring_ctx *ctx)
+{
+ if (ctx->flags & IORING_SETUP_CQE_MIXED)
+ return IORING_CQE_F_32;
+ return 0;
+}
+
+static inline void io_req_set_res32(struct io_kiocb *req, s32 res, u32 cflags,
+ __u64 extra1, __u64 extra2)
+{
+ req->cqe.res = res;
+ req->cqe.flags = cflags | ctx_cqe32_flags(req->ctx);
+ req->big_cqe.extra1 = extra1;
+ req->big_cqe.extra2 = extra2;
+}
+
static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache,
struct io_kiocb *req)
{
@@ -260,6 +337,19 @@ static inline bool req_has_async_data(struct io_kiocb *req)
return req->flags & REQ_F_ASYNC_DATA;
}
+static inline void io_req_async_data_clear(struct io_kiocb *req,
+ io_req_flags_t extra_flags)
+{
+ req->flags &= ~(REQ_F_ASYNC_DATA|extra_flags);
+ req->async_data = NULL;
+}
+
+static inline void io_req_async_data_free(struct io_kiocb *req)
+{
+ kfree(req->async_data);
+ io_req_async_data_clear(req, 0);
+}
+
static inline void io_put_file(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_FIXED_FILE) && req->file)