author    Linus Torvalds <torvalds@linux-foundation.org>  2022-08-02 13:37:55 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>  2022-08-02 13:37:55 -0700
commit    42df1cbf6a4726934cc5dac12bf263aa73c49fa3 (patch)
tree      129e32104dccc660c3c8fca03b8bba418e572d09
parent    98e247464088a11ce2328a214fdb87d4c06f8db6 (diff)
parent    14b146b688ad9593f5eee93d51a34d09a47e50b5 (diff)
Merge tag 'for-5.20/io_uring-zerocopy-send-2022-07-29' of git://git.kernel.dk/linux-block
Pull io_uring zerocopy support from Jens Axboe:
 "This adds efficient support for zerocopy sends through io_uring. Both
  IPv4 and IPv6 are supported, as well as both TCP and UDP.

  The core network changes to support this are in a stable branch from
  Jakub that both io_uring and net-next have pulled in, and the io_uring
  changes are layered on top of that.

  All of the work has been done by Pavel"

* tag 'for-5.20/io_uring-zerocopy-send-2022-07-29' of git://git.kernel.dk/linux-block: (34 commits)
  io_uring: notification completion optimisation
  io_uring: export req alloc from core
  io_uring/net: use unsigned for flags
  io_uring/net: make page accounting more consistent
  io_uring/net: checks errors of zc mem accounting
  io_uring/net: improve io_get_notif_slot types
  selftests/io_uring: test zerocopy send
  io_uring: enable managed frags with register buffers
  io_uring: add zc notification flush requests
  io_uring: rename IORING_OP_FILES_UPDATE
  io_uring: flush notifiers after sendzc
  io_uring: sendzc with fixed buffers
  io_uring: allow to pass addr into sendzc
  io_uring: account locked pages for non-fixed zc
  io_uring: wire send zc request type
  io_uring: add notification slot registration
  io_uring: add rsrc referencing for notifiers
  io_uring: complete notifiers in tw
  io_uring: cache struct io_notif
  io_uring: add zc notification infrastructure
  ...
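For orientation, a rough userspace sketch of issuing one zerocopy send with this API. This is a hypothetical example grounded only in the uapi added below (IORING_OP_SENDZC_NOTIF, the sqe notification_idx field, zc flags read from sqe->ioprio); the liburing-style io_uring_get_sqe()/io_uring_submit() helpers and the ring, sockfd, buf, and buf_len variables are assumptions, and this slot-based interface is specific to this merge:

	/* Hedged sketch, not the canonical API: queue one zerocopy send on
	 * a ring that already has a notification slot registered (see the
	 * IORING_REGISTER_NOTIFIERS sketch after the uapi diff below). */
	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_SENDZC_NOTIF;      /* new opcode in this merge */
	sqe->fd = sockfd;                          /* connected TCP/UDP socket */
	sqe->addr = (unsigned long)buf;            /* payload; must stay stable */
	sqe->len = buf_len;
	sqe->ioprio = IORING_RECVSEND_NOTIF_FLUSH; /* zc flags live in ioprio */
	sqe->notification_idx = 0;                 /* registered slot to attach to */
	io_uring_submit(&ring);

	/* Two completions matter: the request CQE carries the send result,
	 * and the flushed notification slot later posts a CQE with the
	 * slot's tag once the kernel is done with the pages. Userspace may
	 * only reuse buf after that second CQE. */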
-rw-r--r--  include/linux/io_uring_types.h                        30
-rw-r--r--  include/linux/skbuff.h                                66
-rw-r--r--  include/linux/socket.h                                 5
-rw-r--r--  include/uapi/linux/io_uring.h                         45
-rw-r--r--  io_uring/Makefile                                      2
-rw-r--r--  io_uring/io_uring.c                                   61
-rw-r--r--  io_uring/io_uring.h                                   43
-rw-r--r--  io_uring/net.c                                       193
-rw-r--r--  io_uring/net.h                                         3
-rw-r--r--  io_uring/notif.c                                     159
-rw-r--r--  io_uring/notif.h                                      90
-rw-r--r--  io_uring/opdef.c                                      24
-rw-r--r--  io_uring/rsrc.c                                       67
-rw-r--r--  io_uring/rsrc.h                                       25
-rw-r--r--  io_uring/tctx.h                                       26
-rw-r--r--  net/compat.c                                           1
-rw-r--r--  net/core/datagram.c                                   14
-rw-r--r--  net/core/skbuff.c                                     37
-rw-r--r--  net/ipv4/ip_output.c                                  50
-rw-r--r--  net/ipv4/tcp.c                                        33
-rw-r--r--  net/ipv6/ip6_output.c                                 49
-rw-r--r--  net/socket.c                                           2
-rw-r--r--  tools/testing/selftests/net/Makefile                   1
-rw-r--r--  tools/testing/selftests/net/io_uring_zerocopy_tx.c   605
-rwxr-xr-x  tools/testing/selftests/net/io_uring_zerocopy_tx.sh  131
25 files changed, 1604 insertions(+), 158 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index d54b8b7e0746..f7fab3758cb9 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -4,6 +4,7 @@
#include <linux/blkdev.h>
#include <linux/task_work.h>
#include <linux/bitmap.h>
+#include <linux/llist.h>
#include <uapi/linux/io_uring.h>
struct io_wq_work_node {
@@ -33,6 +34,9 @@ struct io_file_table {
unsigned int alloc_hint;
};
+struct io_notif;
+struct io_notif_slot;
+
struct io_hash_bucket {
spinlock_t lock;
struct hlist_head list;
@@ -43,6 +47,30 @@ struct io_hash_table {
unsigned hash_bits;
};
+/*
+ * Arbitrary limit, can be raised if need be
+ */
+#define IO_RINGFD_REG_MAX 16
+
+struct io_uring_task {
+ /* submission side */
+ int cached_refs;
+ const struct io_ring_ctx *last;
+ struct io_wq *io_wq;
+ struct file *registered_rings[IO_RINGFD_REG_MAX];
+
+ struct xarray xa;
+ struct wait_queue_head wait;
+ atomic_t in_idle;
+ atomic_t inflight_tracked;
+ struct percpu_counter inflight;
+
+ struct { /* task_work */
+ struct llist_head task_list;
+ struct callback_head task_work;
+ } ____cacheline_aligned_in_smp;
+};
+
struct io_uring {
u32 head ____cacheline_aligned_in_smp;
u32 tail ____cacheline_aligned_in_smp;
@@ -212,6 +240,8 @@ struct io_ring_ctx {
unsigned nr_user_files;
unsigned nr_user_bufs;
struct io_mapped_ubuf **user_bufs;
+ struct io_notif_slot *notif_slots;
+ unsigned nr_notif_slots;
struct io_submit_state submit_state;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d3d10556f0fa..1111adefd906 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -686,10 +686,18 @@ enum {
* charged to the kernel memory.
*/
SKBFL_PURE_ZEROCOPY = BIT(2),
+
+ SKBFL_DONT_ORPHAN = BIT(3),
+
+ /* page references are managed by the ubuf_info, so it's safe to
+ * use frags only up until ubuf_info is released
+ */
+ SKBFL_MANAGED_FRAG_REFS = BIT(4),
};
#define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
-#define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY)
+#define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY | \
+ SKBFL_DONT_ORPHAN | SKBFL_MANAGED_FRAG_REFS)
/*
* The callback notifies userspace to release buffers when skb DMA is done in
@@ -1773,13 +1781,14 @@ void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
bool success);
-int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
- struct iov_iter *from, size_t length);
+int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
+ struct sk_buff *skb, struct iov_iter *from,
+ size_t length);
static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb,
struct msghdr *msg, int len)
{
- return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
+ return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len);
}
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
@@ -1806,6 +1815,11 @@ static inline bool skb_zcopy_pure(const struct sk_buff *skb)
return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY;
}
+static inline bool skb_zcopy_managed(const struct sk_buff *skb)
+{
+ return skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAG_REFS;
+}
+
static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1,
const struct sk_buff *skb2)
{
@@ -1880,6 +1894,14 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success)
}
}
+void __skb_zcopy_downgrade_managed(struct sk_buff *skb);
+
+static inline void skb_zcopy_downgrade_managed(struct sk_buff *skb)
+{
+ if (unlikely(skb_zcopy_managed(skb)))
+ __skb_zcopy_downgrade_managed(skb);
+}
+
static inline void skb_mark_not_on_list(struct sk_buff *skb)
{
skb->next = NULL;
@@ -2528,6 +2550,22 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb)
return skb_headlen(skb) + __skb_pagelen(skb);
}
+static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo,
+ int i, struct page *page,
+ int off, int size)
+{
+ skb_frag_t *frag = &shinfo->frags[i];
+
+ /*
+ * Propagate page pfmemalloc to the skb if we can. The problem is
+ * that not all callers have unique ownership of the page but rely
+ * on page_is_pfmemalloc doing the right thing(tm).
+ */
+ frag->bv_page = page;
+ frag->bv_offset = off;
+ skb_frag_size_set(frag, size);
+}
+
/**
* __skb_fill_page_desc - initialise a paged fragment in an skb
* @skb: buffer containing fragment to be initialised
@@ -2544,17 +2582,7 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb)
static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
struct page *page, int off, int size)
{
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-
- /*
- * Propagate page pfmemalloc to the skb if we can. The problem is
- * that not all callers have unique ownership of the page but rely
- * on page_is_pfmemalloc doing the right thing(tm).
- */
- frag->bv_page = page;
- frag->bv_offset = off;
- skb_frag_size_set(frag, size);
-
+ __skb_fill_page_desc_noacc(skb_shinfo(skb), i, page, off, size);
page = compound_head(page);
if (page_is_pfmemalloc(page))
skb->pfmemalloc = true;
@@ -3182,8 +3210,7 @@ static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
{
if (likely(!skb_zcopy(skb)))
return 0;
- if (!skb_zcopy_is_nouarg(skb) &&
- skb_uarg(skb)->callback == msg_zerocopy_callback)
+ if (skb_shinfo(skb)->flags & SKBFL_DONT_ORPHAN)
return 0;
return skb_copy_ubufs(skb, gfp_mask);
}
@@ -3496,7 +3523,10 @@ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
*/
static inline void skb_frag_unref(struct sk_buff *skb, int f)
{
- __skb_frag_unref(&skb_shinfo(skb)->frags[f], skb->pp_recycle);
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ if (!skb_zcopy_managed(skb))
+ __skb_frag_unref(&shinfo->frags[f], skb->pp_recycle);
}
/**
diff --git a/include/linux/socket.h b/include/linux/socket.h
index be24f1c8568a..d4523974efbd 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -14,6 +14,8 @@ struct file;
struct pid;
struct cred;
struct socket;
+struct sock;
+struct sk_buff;
#define __sockaddr_check_size(size) \
BUILD_BUG_ON(((size) > sizeof(struct __kernel_sockaddr_storage)))
@@ -69,6 +71,9 @@ struct msghdr {
unsigned int msg_flags; /* flags on received message */
__kernel_size_t msg_controllen; /* ancillary data buffer length */
struct kiocb *msg_iocb; /* ptr to iocb for async requests */
+ struct ubuf_info *msg_ubuf;
+ int (*sg_from_iter)(struct sock *sk, struct sk_buff *skb,
+ struct iov_iter *from, size_t length);
};
struct user_msghdr {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 4c9b11e2e991..1463cfecb56b 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -66,6 +66,10 @@ struct io_uring_sqe {
union {
__s32 splice_fd_in;
__u32 file_index;
+ struct {
+ __u16 notification_idx;
+ __u16 addr_len;
+ };
};
union {
struct {
@@ -170,7 +174,8 @@ enum io_uring_op {
IORING_OP_FALLOCATE,
IORING_OP_OPENAT,
IORING_OP_CLOSE,
- IORING_OP_FILES_UPDATE,
+ IORING_OP_RSRC_UPDATE,
+ IORING_OP_FILES_UPDATE = IORING_OP_RSRC_UPDATE,
IORING_OP_STATX,
IORING_OP_READ,
IORING_OP_WRITE,
@@ -197,6 +202,7 @@ enum io_uring_op {
IORING_OP_GETXATTR,
IORING_OP_SOCKET,
IORING_OP_URING_CMD,
+ IORING_OP_SENDZC_NOTIF,
/* this goes last, obviously */
IORING_OP_LAST,
@@ -218,6 +224,7 @@ enum io_uring_op {
#define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5)
#define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
#define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
+
/*
* sqe->splice_flags
* extends splice(2) flags
@@ -267,15 +274,32 @@ enum io_uring_op {
* IORING_RECV_MULTISHOT Multishot recv. Sets IORING_CQE_F_MORE if
* the handler will continue to report
* CQEs on behalf of the same SQE.
+ *
+ * IORING_RECVSEND_FIXED_BUF Use registered buffers, the index is stored in
+ * the buf_index field.
+ *
+ * IORING_RECVSEND_NOTIF_FLUSH Flush a notification after a
+ * successful send. Only for zerocopy sends.
*/
#define IORING_RECVSEND_POLL_FIRST (1U << 0)
-#define IORING_RECV_MULTISHOT (1U << 1)
+#define IORING_RECV_MULTISHOT (1U << 1)
+#define IORING_RECVSEND_FIXED_BUF (1U << 2)
+#define IORING_RECVSEND_NOTIF_FLUSH (1U << 3)
/*
* accept flags stored in sqe->ioprio
*/
#define IORING_ACCEPT_MULTISHOT (1U << 0)
+
+/*
+ * IORING_OP_RSRC_UPDATE flags
+ */
+enum {
+ IORING_RSRC_UPDATE_FILES,
+ IORING_RSRC_UPDATE_NOTIF,
+};
+
/*
* IORING_OP_MSG_RING command types, stored in sqe->addr
*/
@@ -457,6 +481,10 @@ enum {
/* register a range of fixed file slots for automatic slot allocation */
IORING_REGISTER_FILE_ALLOC_RANGE = 25,
+ /* zerocopy notification API */
+ IORING_REGISTER_NOTIFIERS = 26,
+ IORING_UNREGISTER_NOTIFIERS = 27,
+
/* this goes last */
IORING_REGISTER_LAST
};
@@ -503,6 +531,19 @@ struct io_uring_rsrc_update2 {
__u32 resv2;
};
+struct io_uring_notification_slot {
+ __u64 tag;
+ __u64 resv[3];
+};
+
+struct io_uring_notification_register {
+ __u32 nr_slots;
+ __u32 resv;
+ __u64 resv2;
+ __u64 data;
+ __u64 resv3;
+};
+
/* Skip updating fd indexes set to this value in the fd table */
#define IORING_REGISTER_FILES_SKIP (-2)
diff --git a/io_uring/Makefile b/io_uring/Makefile
index 466639c289be..8cc8e5387a75 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -7,5 +7,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
openclose.o uring_cmd.o epoll.o \
statx.o net.o msg_ring.o timeout.o \
sqpoll.o fdinfo.o tctx.o poll.o \
- cancel.o kbuf.o rsrc.o rw.o opdef.o
+ cancel.o kbuf.o rsrc.o rw.o opdef.o notif.o
obj-$(CONFIG_IO_WQ) += io-wq.o
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 4b3fd645d023..b54218da075c 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -90,6 +90,7 @@
#include "rsrc.h"
#include "cancel.h"
#include "net.h"
+#include "notif.h"
#include "timeout.h"
#include "poll.h"
@@ -608,7 +609,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
return ret;
}
-static void __io_put_task(struct task_struct *task, int nr)
+void __io_put_task(struct task_struct *task, int nr)
{
struct io_uring_task *tctx = task->io_uring;
@@ -618,16 +619,7 @@ static void __io_put_task(struct task_struct *task, int nr)
put_task_struct_many(task, nr);
}
-/* must to be called somewhat shortly after putting a request */
-static inline void io_put_task(struct task_struct *task, int nr)
-{
- if (likely(task == current))
- task->io_uring->cached_refs += nr;
- else
- __io_put_task(task, nr);
-}
-
-static void io_task_refs_refill(struct io_uring_task *tctx)
+void io_task_refs_refill(struct io_uring_task *tctx)
{
unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
@@ -636,15 +628,6 @@ static void io_task_refs_refill(struct io_uring_task *tctx)
tctx->cached_refs += refill;
}
-static inline void io_get_task_refs(int nr)
-{
- struct io_uring_task *tctx = current->io_uring;
-
- tctx->cached_refs -= nr;
- if (unlikely(tctx->cached_refs < 0))
- io_task_refs_refill(tctx);
-}
-
static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
{
struct io_uring_task *tctx = task->io_uring;
@@ -741,9 +724,8 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
return &rings->cqes[off];
}
-static bool io_fill_cqe_aux(struct io_ring_ctx *ctx,
- u64 user_data, s32 res, u32 cflags,
- bool allow_overflow)
+bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
+ bool allow_overflow)
{
struct io_uring_cqe *cqe;
@@ -868,18 +850,13 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
spin_unlock(&ctx->completion_lock);
}
-static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
-{
- return !ctx->submit_state.free_list.next;
-}
-
/*
* A request might get retired back into the request caches even before opcode
* handlers and io_issue_sqe() are done with it, e.g. inline completion path.
* Because of that, io_alloc_req() should be called only under ->uring_lock
* and with extra caution to not get a request that is still worked on.
*/
-static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
+__cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
@@ -920,21 +897,6 @@ static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
return true;
}
-static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
-{
- if (unlikely(io_req_cache_empty(ctx)))
- return __io_alloc_req_refill(ctx);
- return true;
-}
-
-static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
-{
- struct io_wq_work_node *node;
-
- node = wq_stack_extract(&ctx->submit_state.free_list);
- return container_of(node, struct io_kiocb, comp_list);
-}
-
static inline void io_dismantle_req(struct io_kiocb *req)
{
unsigned int flags = req->flags;
@@ -2500,6 +2462,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
}
#endif
WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
+ WARN_ON_ONCE(ctx->notif_slots || ctx->nr_notif_slots);
io_mem_free(ctx->rings);
io_mem_free(ctx->sq_sqes);
@@ -2676,6 +2639,7 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
io_unregister_personality(ctx, index);
if (ctx->rings)
io_poll_remove_all(ctx, NULL, true);
+ io_notif_unregister(ctx);
mutex_unlock(&ctx->uring_lock);
/* failed during ring init, it couldn't have issued any requests */
@@ -3874,6 +3838,15 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_register_file_alloc_range(ctx, arg);
break;
+ case IORING_REGISTER_NOTIFIERS:
+ ret = io_notif_register(ctx, arg, nr_args);
+ break;
+ case IORING_UNREGISTER_NOTIFIERS:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_notif_unregister(ctx);
+ break;
default:
ret = -EINVAL;
break;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 5db0a60dc04e..2f73f83af960 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -33,6 +33,8 @@ void io_req_complete_post(struct io_kiocb *req);
void __io_req_complete_post(struct io_kiocb *req);
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
bool allow_overflow);
+bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
+ bool allow_overflow);
void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
@@ -71,6 +73,9 @@ void io_wq_submit_work(struct io_wq_work *work);
void io_free_req(struct io_kiocb *req);
void io_queue_next(struct io_kiocb *req);
+void __io_put_task(struct task_struct *task, int nr);
+void io_task_refs_refill(struct io_uring_task *tctx);
+bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
bool cancel_all);
@@ -258,4 +263,42 @@ static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx)
__io_commit_cqring_flush(ctx);
}
+/* must be called somewhat shortly after putting a request */
+static inline void io_put_task(struct task_struct *task, int nr)
+{
+ if (likely(task == current))
+ task->io_uring->cached_refs += nr;
+ else
+ __io_put_task(task, nr);
+}
+
+static inline void io_get_task_refs(int nr)
+{
+ struct io_uring_task *tctx = current->io_uring;
+
+ tctx->cached_refs -= nr;
+ if (unlikely(tctx->cached_refs < 0))
+ io_task_refs_refill(tctx);
+}
+
+static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
+{
+ return !ctx->submit_state.free_list.next;
+}
+
+static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
+{
+ if (unlikely(io_req_cache_empty(ctx)))
+ return __io_alloc_req_refill(ctx);
+ return true;
+}
+
+static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
+{
+ struct io_wq_work_node *node;
+
+ node = wq_stack_extract(&ctx->submit_state.free_list);
+ return container_of(node, struct io_kiocb, comp_list);
+}
+
#endif
diff --git a/io_uring/net.c b/io_uring/net.c
index e61efa31c729..32fc3da04e41 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -14,6 +14,8 @@
#include "kbuf.h"
#include "alloc_cache.h"
#include "net.h"
+#include "notif.h"
+#include "rsrc.h"
#if defined(CONFIG_NET)
struct io_shutdown {
@@ -53,10 +55,21 @@ struct io_sr_msg {
struct user_msghdr __user *umsg;
void __user *buf;
};
- int msg_flags;
+ unsigned msg_flags;
+ unsigned flags;
size_t len;
size_t done_io;
- unsigned int flags;
+};
+
+struct io_sendzc {
+ struct file *file;
+ void __user *buf;
+ size_t len;
+ u16 slot_idx;
+ unsigned msg_flags;
+ unsigned flags;
+ unsigned addr_len;
+ void __user *addr;
};
#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED)
@@ -294,6 +307,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_namelen = 0;
+ msg.msg_ubuf = NULL;
flags = sr->msg_flags;
if (issue_flags & IO_URING_F_NONBLOCK)
@@ -783,6 +797,7 @@ retry_multishot:
msg.msg_flags = 0;
msg.msg_controllen = 0;
msg.msg_iocb = NULL;
+ msg.msg_ubuf = NULL;
flags = sr->msg_flags;
if (force_nonblock)
@@ -832,6 +847,180 @@ out_free:
return ret;
}
+int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_sendzc *zc = io_kiocb_to_cmd(req);
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))
+ return -EINVAL;
+
+ zc->flags = READ_ONCE(sqe->ioprio);
+ if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST |
+ IORING_RECVSEND_FIXED_BUF | IORING_RECVSEND_NOTIF_FLUSH))
+ return -EINVAL;
+ if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
+ unsigned idx = READ_ONCE(sqe->buf_index);
+
+ if (unlikely(idx >= ctx->nr_user_bufs))
+ return -EFAULT;
+ idx = array_index_nospec(idx, ctx->nr_user_bufs);
+ req->imu = READ_ONCE(ctx->user_bufs[idx]);
+ io_req_set_rsrc_node(req, ctx, 0);
+ }
+
+ zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ zc->len = READ_ONCE(sqe->len);
+ zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
+ zc->slot_idx = READ_ONCE(sqe->notification_idx);
+ if (zc->msg_flags & MSG_DONTWAIT)
+ req->flags |= REQ_F_NOWAIT;
+
+ zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+ zc->addr_len = READ_ONCE(sqe->addr_len);
+
+#ifdef CONFIG_COMPAT
+ if (req->ctx->compat)
+ zc->msg_flags |= MSG_CMSG_COMPAT;
+#endif
+ return 0;
+}
+
+static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb,
+ struct iov_iter *from, size_t length)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ int frag = shinfo->nr_frags;
+ int ret = 0;
+ struct bvec_iter bi;
+ ssize_t copied = 0;
+ unsigned long truesize = 0;
+
+ if (!shinfo->nr_frags)
+ shinfo->flags |= SKBFL_MANAGED_FRAG_REFS;
+
+ if (!skb_zcopy_managed(skb) || !iov_iter_is_bvec(from)) {
+ skb_zcopy_downgrade_managed(skb);
+ return __zerocopy_sg_from_iter(NULL, sk, skb, from, length);
+ }
+
+ bi.bi_size = min(from->count, length);
+ bi.bi_bvec_done = from->iov_offset;
+ bi.bi_idx = 0;
+
+ while (bi.bi_size && frag < MAX_SKB_FRAGS) {
+ struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi);
+
+ copied += v.bv_len;
+ truesize += PAGE_ALIGN(v.bv_len + v.bv_offset);
+ __skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page,
+ v.bv_offset, v.bv_len);
+ bvec_iter_advance_single(from->bvec, &bi, v.bv_len);
+ }
+ if (bi.bi_size)
+ ret = -EMSGSIZE;
+
+ shinfo->nr_frags = frag;
+ from->bvec += bi.bi_idx;
+ from->nr_segs -= bi.bi_idx;
+ from->count = bi.bi_size;
+ from->iov_offset = bi.bi_bvec_done;
+
+ skb->data_len += copied;
+ skb->len += copied;
+ skb->truesize += truesize;
+
+ if (sk && sk->sk_type == SOCK_STREAM) {
+ sk_wmem_queued_add(sk, truesize);
+ if (!skb_zcopy_pure(skb))
+ sk_mem_charge(sk, truesize);
+ } else {
+ refcount_add(truesize, &skb->sk->sk_wmem_alloc);
+ }
+ return ret;
+}
+
+int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct sockaddr_storage address;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_sendzc *zc = io_kiocb_to_cmd(req);
+ struct io_notif_slot *notif_slot;
+ struct io_kiocb *notif;
+ struct msghdr msg;
+ struct iovec iov;
+ struct socket *sock;
+ unsigned msg_flags;
+ int ret, min_ret = 0;
+
+ if (!(req->flags & REQ_F_POLLED) &&
+ (zc->flags & IORING_RECVSEND_POLL_FIRST))
+ return -EAGAIN;
+
+ if (issue_flags & IO_URING_F_UNLOCKED)
+ return -EAGAIN;
+ sock = sock_from_file(req->file);
+ if (unlikely(!sock))
+ return -ENOTSOCK;
+
+ notif_slot = io_get_notif_slot(ctx, zc->slot_idx);
+ if (!notif_slot)
+ return -EINVAL;
+ notif = io_get_notif(ctx, notif_slot);
+ if (!notif)
+ return -ENOMEM;
+
+ msg.msg_name = NULL;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_namelen = 0;
+
+ if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
+ ret = io_import_fixed(WRITE, &msg.msg_iter, req->imu,
+ (u64)(uintptr_t)zc->buf, zc->len);
+ if (unlikely(ret))
+ return ret;
+ } else {
+ ret = import_single_range(WRITE, zc->buf, zc->len, &iov,
+ &msg.msg_iter);
+ if (unlikely(ret))
+ return ret;
+ ret = io_notif_account_mem(notif, zc->len);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ if (zc->addr) {
+ ret = move_addr_to_kernel(zc->addr, zc->addr_len, &address);
+ if (unlikely(ret < 0))
+ return ret;
+ msg.msg_name = (struct sockaddr *)&address;
+ msg.msg_namelen = zc->addr_len;
+ }
+
+ msg_flags = zc->msg_flags | MSG_ZEROCOPY;
+ if (issue_flags & IO_URING_F_NONBLOCK)
+ msg_flags |= MSG_DONTWAIT;
+ if (msg_flags & MSG_WAITALL)
+ min_ret = iov_iter_count(&msg.msg_iter);
+
+ msg.msg_flags = msg_flags;
+ msg.msg_ubuf = &io_notif_to_data(notif)->uarg;
+ msg.sg_from_iter = io_sg_from_iter;
+ ret = sock_sendmsg(sock, &msg);
+
+ if (unlikely(ret < min_ret)) {
+ if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
+ return -EAGAIN;
+ return ret == -ERESTARTSYS ? -EINTR : ret;
+ }
+
+ if (zc->flags & IORING_RECVSEND_NOTIF_FLUSH)
+ io_notif_slot_flush_submit(notif_slot, 0);
+ io_req_set_res(req, ret, 0);
+ return IOU_OK;
+}
+
int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_accept *accept = io_kiocb_to_cmd(req);
diff --git a/io_uring/net.h b/io_uring/net.h
index db20ce9d6546..7c438d39c089 100644
--- a/io_uring/net.h
+++ b/io_uring/net.h
@@ -52,6 +52,9 @@ int io_connect_prep_async(struct io_kiocb *req);
int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_connect(struct io_kiocb *req, unsigned int issue_flags);
+int io_sendzc(struct io_kiocb *req, unsigned int issue_flags);
+int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+
void io_netmsg_cache_free(struct io_cache_entry *entry);
#else
static inline void io_netmsg_cache_free(struct io_cache_entry *entry)
diff --git a/io_uring/notif.c b/io_uring/notif.c
new file mode 100644
index 000000000000..b5f989dff9de
--- /dev/null
+++ b/io_uring/notif.c
@@ -0,0 +1,159 @@
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/net.h>
+#include <linux/io_uring.h>
+
+#include "io_uring.h"
+#include "notif.h"
+#include "rsrc.h"
+
+static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked)
+{
+ struct io_notif_data *nd = io_notif_to_data(notif);
+ struct io_ring_ctx *ctx = notif->ctx;
+
+ if (nd->account_pages && ctx->user) {
+ __io_unaccount_mem(ctx->user, nd->account_pages);
+ nd->account_pages = 0;
+ }
+ io_req_task_complete(notif, locked);
+}
+
+static inline void io_notif_complete(struct io_kiocb *notif)
+ __must_hold(&notif->ctx->uring_lock)
+{
+ bool locked = true;
+
+ __io_notif_complete_tw(notif, &locked);
+}
+
+static void io_uring_tx_zerocopy_callback(struct sk_buff *skb,
+ struct ubuf_info *uarg,
+ bool success)
+{
+ struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg);
+ struct io_kiocb *notif = cmd_to_io_kiocb(nd);
+
+ if (refcount_dec_and_test(&uarg->refcnt)) {
+ notif->io_task_work.func = __io_notif_complete_tw;
+ io_req_task_work_add(notif);
+ }
+}
+
+struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx,
+ struct io_notif_slot *slot)
+ __must_hold(&ctx->uring_lock)
+{
+ struct io_kiocb *notif;
+ struct io_notif_data *nd;
+
+ if (unlikely(!io_alloc_req_refill(ctx)))
+ return NULL;
+ notif = io_alloc_req(ctx);
+ notif->opcode = IORING_OP_NOP;
+ notif->flags = 0;
+ notif->file = NULL;
+ notif->task = current;
+ io_get_task_refs(1);
+ notif->rsrc_node = NULL;
+ io_req_set_rsrc_node(notif, ctx, 0);
+ notif->cqe.user_data = slot->tag;
+ notif->cqe.flags = slot->seq++;
+ notif->cqe.res = 0;
+
+ nd = io_notif_to_data(notif);
+ nd->account_pages = 0;
+ nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
+ nd->uarg.callback = io_uring_tx_zerocopy_callback;
+ /* master ref owned by io_notif_slot, will be dropped on flush */
+ refcount_set(