diff options
| -rw-r--r-- | include/linux/io_uring_types.h | 30 | ||||
| -rw-r--r-- | include/linux/skbuff.h | 66 | ||||
| -rw-r--r-- | include/linux/socket.h | 5 | ||||
| -rw-r--r-- | include/uapi/linux/io_uring.h | 45 | ||||
| -rw-r--r-- | io_uring/Makefile | 2 | ||||
| -rw-r--r-- | io_uring/io_uring.c | 61 | ||||
| -rw-r--r-- | io_uring/io_uring.h | 43 | ||||
| -rw-r--r-- | io_uring/net.c | 193 | ||||
| -rw-r--r-- | io_uring/net.h | 3 | ||||
| -rw-r--r-- | io_uring/notif.c | 159 | ||||
| -rw-r--r-- | io_uring/notif.h | 90 | ||||
| -rw-r--r-- | io_uring/opdef.c | 24 | ||||
| -rw-r--r-- | io_uring/rsrc.c | 67 | ||||
| -rw-r--r-- | io_uring/rsrc.h | 25 | ||||
| -rw-r--r-- | io_uring/tctx.h | 26 | ||||
| -rw-r--r-- | net/compat.c | 1 | ||||
| -rw-r--r-- | net/core/datagram.c | 14 | ||||
| -rw-r--r-- | net/core/skbuff.c | 37 | ||||
| -rw-r--r-- | net/ipv4/ip_output.c | 50 | ||||
| -rw-r--r-- | net/ipv4/tcp.c | 33 | ||||
| -rw-r--r-- | net/ipv6/ip6_output.c | 49 | ||||
| -rw-r--r-- | net/socket.c | 2 | ||||
| -rw-r--r-- | tools/testing/selftests/net/Makefile | 1 | ||||
| -rw-r--r-- | tools/testing/selftests/net/io_uring_zerocopy_tx.c | 605 | ||||
| -rwxr-xr-x | tools/testing/selftests/net/io_uring_zerocopy_tx.sh | 131 |
25 files changed, 1604 insertions, 158 deletions
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index d54b8b7e0746..f7fab3758cb9 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -4,6 +4,7 @@ #include <linux/blkdev.h> #include <linux/task_work.h> #include <linux/bitmap.h> +#include <linux/llist.h> #include <uapi/linux/io_uring.h> struct io_wq_work_node { @@ -33,6 +34,9 @@ struct io_file_table { unsigned int alloc_hint; }; +struct io_notif; +struct io_notif_slot; + struct io_hash_bucket { spinlock_t lock; struct hlist_head list; @@ -43,6 +47,30 @@ struct io_hash_table { unsigned hash_bits; }; +/* + * Arbitrary limit, can be raised if need be + */ +#define IO_RINGFD_REG_MAX 16 + +struct io_uring_task { + /* submission side */ + int cached_refs; + const struct io_ring_ctx *last; + struct io_wq *io_wq; + struct file *registered_rings[IO_RINGFD_REG_MAX]; + + struct xarray xa; + struct wait_queue_head wait; + atomic_t in_idle; + atomic_t inflight_tracked; + struct percpu_counter inflight; + + struct { /* task_work */ + struct llist_head task_list; + struct callback_head task_work; + } ____cacheline_aligned_in_smp; +}; + struct io_uring { u32 head ____cacheline_aligned_in_smp; u32 tail ____cacheline_aligned_in_smp; @@ -212,6 +240,8 @@ struct io_ring_ctx { unsigned nr_user_files; unsigned nr_user_bufs; struct io_mapped_ubuf **user_bufs; + struct io_notif_slot *notif_slots; + unsigned nr_notif_slots; struct io_submit_state submit_state; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d3d10556f0fa..1111adefd906 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -686,10 +686,18 @@ enum { * charged to the kernel memory. */ SKBFL_PURE_ZEROCOPY = BIT(2), + + SKBFL_DONT_ORPHAN = BIT(3), + + /* page references are managed by the ubuf_info, so it's safe to + * use frags only up until ubuf_info is released + */ + SKBFL_MANAGED_FRAG_REFS = BIT(4), }; #define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG) -#define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY) +#define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY | \ + SKBFL_DONT_ORPHAN | SKBFL_MANAGED_FRAG_REFS) /* * The callback notifies userspace to release buffers when skb DMA is done in @@ -1773,13 +1781,14 @@ void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, bool success); -int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, - struct iov_iter *from, size_t length); +int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb, struct iov_iter *from, + size_t length); static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) { - return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len); + return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len); } int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, @@ -1806,6 +1815,11 @@ static inline bool skb_zcopy_pure(const struct sk_buff *skb) return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY; } +static inline bool skb_zcopy_managed(const struct sk_buff *skb) +{ + return skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAG_REFS; +} + static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1, const struct sk_buff *skb2) { @@ -1880,6 +1894,14 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success) } } +void __skb_zcopy_downgrade_managed(struct sk_buff *skb); + +static inline void skb_zcopy_downgrade_managed(struct sk_buff *skb) +{ + if (unlikely(skb_zcopy_managed(skb))) + __skb_zcopy_downgrade_managed(skb); +} + static inline void skb_mark_not_on_list(struct sk_buff *skb) { skb->next = NULL; @@ -2528,6 +2550,22 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb) return skb_headlen(skb) + __skb_pagelen(skb); } +static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo, + int i, struct page *page, + int off, int size) +{ + skb_frag_t *frag = &shinfo->frags[i]; + + /* + * Propagate page pfmemalloc to the skb if we can. The problem is + * that not all callers have unique ownership of the page but rely + * on page_is_pfmemalloc doing the right thing(tm). + */ + frag->bv_page = page; + frag->bv_offset = off; + skb_frag_size_set(frag, size); +} + /** * __skb_fill_page_desc - initialise a paged fragment in an skb * @skb: buffer containing fragment to be initialised @@ -2544,17 +2582,7 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb) static inline void __skb_fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size) { - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - /* - * Propagate page pfmemalloc to the skb if we can. The problem is - * that not all callers have unique ownership of the page but rely - * on page_is_pfmemalloc doing the right thing(tm). - */ - frag->bv_page = page; - frag->bv_offset = off; - skb_frag_size_set(frag, size); - + __skb_fill_page_desc_noacc(skb_shinfo(skb), i, page, off, size); page = compound_head(page); if (page_is_pfmemalloc(page)) skb->pfmemalloc = true; @@ -3182,8 +3210,7 @@ static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask) { if (likely(!skb_zcopy(skb))) return 0; - if (!skb_zcopy_is_nouarg(skb) && - skb_uarg(skb)->callback == msg_zerocopy_callback) + if (skb_shinfo(skb)->flags & SKBFL_DONT_ORPHAN) return 0; return skb_copy_ubufs(skb, gfp_mask); } @@ -3496,7 +3523,10 @@ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) */ static inline void skb_frag_unref(struct sk_buff *skb, int f) { - __skb_frag_unref(&skb_shinfo(skb)->frags[f], skb->pp_recycle); + struct skb_shared_info *shinfo = skb_shinfo(skb); + + if (!skb_zcopy_managed(skb)) + __skb_frag_unref(&shinfo->frags[f], skb->pp_recycle); } /** diff --git a/include/linux/socket.h b/include/linux/socket.h index be24f1c8568a..d4523974efbd 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -14,6 +14,8 @@ struct file; struct pid; struct cred; struct socket; +struct sock; +struct sk_buff; #define __sockaddr_check_size(size) \ BUILD_BUG_ON(((size) > sizeof(struct __kernel_sockaddr_storage))) @@ -69,6 +71,9 @@ struct msghdr { unsigned int msg_flags; /* flags on received message */ __kernel_size_t msg_controllen; /* ancillary data buffer length */ struct kiocb *msg_iocb; /* ptr to iocb for async requests */ + struct ubuf_info *msg_ubuf; + int (*sg_from_iter)(struct sock *sk, struct sk_buff *skb, + struct iov_iter *from, size_t length); }; struct user_msghdr { diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 4c9b11e2e991..1463cfecb56b 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -66,6 +66,10 @@ struct io_uring_sqe { union { __s32 splice_fd_in; __u32 file_index; + struct { + __u16 notification_idx; + __u16 addr_len; + }; }; union { struct { @@ -170,7 +174,8 @@ enum io_uring_op { IORING_OP_FALLOCATE, IORING_OP_OPENAT, IORING_OP_CLOSE, - IORING_OP_FILES_UPDATE, + IORING_OP_RSRC_UPDATE, + IORING_OP_FILES_UPDATE = IORING_OP_RSRC_UPDATE, IORING_OP_STATX, IORING_OP_READ, IORING_OP_WRITE, @@ -197,6 +202,7 @@ enum io_uring_op { IORING_OP_GETXATTR, IORING_OP_SOCKET, IORING_OP_URING_CMD, + IORING_OP_SENDZC_NOTIF, /* this goes last, obviously */ IORING_OP_LAST, @@ -218,6 +224,7 @@ enum io_uring_op { #define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5) #define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME) #define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE) + /* * sqe->splice_flags * extends splice(2) flags @@ -267,15 +274,32 @@ enum io_uring_op { * IORING_RECV_MULTISHOT Multishot recv. Sets IORING_CQE_F_MORE if * the handler will continue to report * CQEs on behalf of the same SQE. + * + * IORING_RECVSEND_FIXED_BUF Use registered buffers, the index is stored in + * the buf_index field. + * + * IORING_RECVSEND_NOTIF_FLUSH Flush a notification after a successful + * successful. Only for zerocopy sends. */ #define IORING_RECVSEND_POLL_FIRST (1U << 0) -#define IORING_RECV_MULTISHOT (1U << 1) +#define IORING_RECV_MULTISHOT (1U << 1) +#define IORING_RECVSEND_FIXED_BUF (1U << 2) +#define IORING_RECVSEND_NOTIF_FLUSH (1U << 3) /* * accept flags stored in sqe->ioprio */ #define IORING_ACCEPT_MULTISHOT (1U << 0) + +/* + * IORING_OP_RSRC_UPDATE flags + */ +enum { + IORING_RSRC_UPDATE_FILES, + IORING_RSRC_UPDATE_NOTIF, +}; + /* * IORING_OP_MSG_RING command types, stored in sqe->addr */ @@ -457,6 +481,10 @@ enum { /* register a range of fixed file slots for automatic slot allocation */ IORING_REGISTER_FILE_ALLOC_RANGE = 25, + /* zerocopy notification API */ + IORING_REGISTER_NOTIFIERS = 26, + IORING_UNREGISTER_NOTIFIERS = 27, + /* this goes last */ IORING_REGISTER_LAST }; @@ -503,6 +531,19 @@ struct io_uring_rsrc_update2 { __u32 resv2; }; +struct io_uring_notification_slot { + __u64 tag; + __u64 resv[3]; +}; + +struct io_uring_notification_register { + __u32 nr_slots; + __u32 resv; + __u64 resv2; + __u64 data; + __u64 resv3; +}; + /* Skip updating fd indexes set to this value in the fd table */ #define IORING_REGISTER_FILES_SKIP (-2) diff --git a/io_uring/Makefile b/io_uring/Makefile index 466639c289be..8cc8e5387a75 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -7,5 +7,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ sqpoll.o fdinfo.o tctx.o poll.o \ - cancel.o kbuf.o rsrc.o rw.o opdef.o + cancel.o kbuf.o rsrc.o rw.o opdef.o notif.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4b3fd645d023..b54218da075c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -90,6 +90,7 @@ #include "rsrc.h" #include "cancel.h" #include "net.h" +#include "notif.h" #include "timeout.h" #include "poll.h" @@ -608,7 +609,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) return ret; } -static void __io_put_task(struct task_struct *task, int nr) +void __io_put_task(struct task_struct *task, int nr) { struct io_uring_task *tctx = task->io_uring; @@ -618,16 +619,7 @@ static void __io_put_task(struct task_struct *task, int nr) put_task_struct_many(task, nr); } -/* must to be called somewhat shortly after putting a request */ -static inline void io_put_task(struct task_struct *task, int nr) -{ - if (likely(task == current)) - task->io_uring->cached_refs += nr; - else - __io_put_task(task, nr); -} - -static void io_task_refs_refill(struct io_uring_task *tctx) +void io_task_refs_refill(struct io_uring_task *tctx) { unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; @@ -636,15 +628,6 @@ static void io_task_refs_refill(struct io_uring_task *tctx) tctx->cached_refs += refill; } -static inline void io_get_task_refs(int nr) -{ - struct io_uring_task *tctx = current->io_uring; - - tctx->cached_refs -= nr; - if (unlikely(tctx->cached_refs < 0)) - io_task_refs_refill(tctx); -} - static __cold void io_uring_drop_tctx_refs(struct task_struct *task) { struct io_uring_task *tctx = task->io_uring; @@ -741,9 +724,8 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) return &rings->cqes[off]; } -static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, - u64 user_data, s32 res, u32 cflags, - bool allow_overflow) +bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, + bool allow_overflow) { struct io_uring_cqe *cqe; @@ -868,18 +850,13 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, spin_unlock(&ctx->completion_lock); } -static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) -{ - return !ctx->submit_state.free_list.next; -} - /* * A request might get retired back into the request caches even before opcode * handlers and io_issue_sqe() are done with it, e.g. inline completion path. * Because of that, io_alloc_req() should be called only under ->uring_lock * and with extra caution to not get a request that is still worked on. */ -static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) +__cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; @@ -920,21 +897,6 @@ static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) return true; } -static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) -{ - if (unlikely(io_req_cache_empty(ctx))) - return __io_alloc_req_refill(ctx); - return true; -} - -static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) -{ - struct io_wq_work_node *node; - - node = wq_stack_extract(&ctx->submit_state.free_list); - return container_of(node, struct io_kiocb, comp_list); -} - static inline void io_dismantle_req(struct io_kiocb *req) { unsigned int flags = req->flags; @@ -2500,6 +2462,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) } #endif WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); + WARN_ON_ONCE(ctx->notif_slots || ctx->nr_notif_slots); io_mem_free(ctx->rings); io_mem_free(ctx->sq_sqes); @@ -2676,6 +2639,7 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_unregister_personality(ctx, index); if (ctx->rings) io_poll_remove_all(ctx, NULL, true); + io_notif_unregister(ctx); mutex_unlock(&ctx->uring_lock); /* failed during ring init, it couldn't have issued any requests */ @@ -3874,6 +3838,15 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_file_alloc_range(ctx, arg); break; + case IORING_REGISTER_NOTIFIERS: + ret = io_notif_register(ctx, arg, nr_args); + break; + case IORING_UNREGISTER_NOTIFIERS: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_notif_unregister(ctx); + break; default: ret = -EINVAL; break; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 5db0a60dc04e..2f73f83af960 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -33,6 +33,8 @@ void io_req_complete_post(struct io_kiocb *req); void __io_req_complete_post(struct io_kiocb *req); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow); +bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, + bool allow_overflow); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); @@ -71,6 +73,9 @@ void io_wq_submit_work(struct io_wq_work *work); void io_free_req(struct io_kiocb *req); void io_queue_next(struct io_kiocb *req); +void __io_put_task(struct task_struct *task, int nr); +void io_task_refs_refill(struct io_uring_task *tctx); +bool __io_alloc_req_refill(struct io_ring_ctx *ctx); bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all); @@ -258,4 +263,42 @@ static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) __io_commit_cqring_flush(ctx); } +/* must to be called somewhat shortly after putting a request */ +static inline void io_put_task(struct task_struct *task, int nr) +{ + if (likely(task == current)) + task->io_uring->cached_refs += nr; + else + __io_put_task(task, nr); +} + +static inline void io_get_task_refs(int nr) +{ + struct io_uring_task *tctx = current->io_uring; + + tctx->cached_refs -= nr; + if (unlikely(tctx->cached_refs < 0)) + io_task_refs_refill(tctx); +} + +static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) +{ + return !ctx->submit_state.free_list.next; +} + +static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) +{ + if (unlikely(io_req_cache_empty(ctx))) + return __io_alloc_req_refill(ctx); + return true; +} + +static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) +{ + struct io_wq_work_node *node; + + node = wq_stack_extract(&ctx->submit_state.free_list); + return container_of(node, struct io_kiocb, comp_list); +} + #endif diff --git a/io_uring/net.c b/io_uring/net.c index e61efa31c729..32fc3da04e41 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -14,6 +14,8 @@ #include "kbuf.h" #include "alloc_cache.h" #include "net.h" +#include "notif.h" +#include "rsrc.h" #if defined(CONFIG_NET) struct io_shutdown { @@ -53,10 +55,21 @@ struct io_sr_msg { struct user_msghdr __user *umsg; void __user *buf; }; - int msg_flags; + unsigned msg_flags; + unsigned flags; size_t len; size_t done_io; - unsigned int flags; +}; + +struct io_sendzc { + struct file *file; + void __user *buf; + size_t len; + u16 slot_idx; + unsigned msg_flags; + unsigned flags; + unsigned addr_len; + void __user *addr; }; #define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) @@ -294,6 +307,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_namelen = 0; + msg.msg_ubuf = NULL; flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) @@ -783,6 +797,7 @@ retry_multishot: msg.msg_flags = 0; msg.msg_controllen = 0; msg.msg_iocb = NULL; + msg.msg_ubuf = NULL; flags = sr->msg_flags; if (force_nonblock) @@ -832,6 +847,180 @@ out_free: return ret; } +int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_sendzc *zc = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + + if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)) + return -EINVAL; + + zc->flags = READ_ONCE(sqe->ioprio); + if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | + IORING_RECVSEND_FIXED_BUF | IORING_RECVSEND_NOTIF_FLUSH)) + return -EINVAL; + if (zc->flags & IORING_RECVSEND_FIXED_BUF) { + unsigned idx = READ_ONCE(sqe->buf_index); + + if (unlikely(idx >= ctx->nr_user_bufs)) + return -EFAULT; + idx = array_index_nospec(idx, ctx->nr_user_bufs); + req->imu = READ_ONCE(ctx->user_bufs[idx]); + io_req_set_rsrc_node(req, ctx, 0); + } + + zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); + zc->len = READ_ONCE(sqe->len); + zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; + zc->slot_idx = READ_ONCE(sqe->notification_idx); + if (zc->msg_flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + + zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + zc->addr_len = READ_ONCE(sqe->addr_len); + +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + zc->msg_flags |= MSG_CMSG_COMPAT; +#endif + return 0; +} + +static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb, + struct iov_iter *from, size_t length) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + int frag = shinfo->nr_frags; + int ret = 0; + struct bvec_iter bi; + ssize_t copied = 0; + unsigned long truesize = 0; + + if (!shinfo->nr_frags) + shinfo->flags |= SKBFL_MANAGED_FRAG_REFS; + + if (!skb_zcopy_managed(skb) || !iov_iter_is_bvec(from)) { + skb_zcopy_downgrade_managed(skb); + return __zerocopy_sg_from_iter(NULL, sk, skb, from, length); + } + + bi.bi_size = min(from->count, length); + bi.bi_bvec_done = from->iov_offset; + bi.bi_idx = 0; + + while (bi.bi_size && frag < MAX_SKB_FRAGS) { + struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi); + + copied += v.bv_len; + truesize += PAGE_ALIGN(v.bv_len + v.bv_offset); + __skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page, + v.bv_offset, v.bv_len); + bvec_iter_advance_single(from->bvec, &bi, v.bv_len); + } + if (bi.bi_size) + ret = -EMSGSIZE; + + shinfo->nr_frags = frag; + from->bvec += bi.bi_idx; + from->nr_segs -= bi.bi_idx; + from->count = bi.bi_size; + from->iov_offset = bi.bi_bvec_done; + + skb->data_len += copied; + skb->len += copied; + skb->truesize += truesize; + + if (sk && sk->sk_type == SOCK_STREAM) { + sk_wmem_queued_add(sk, truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_charge(sk, truesize); + } else { + refcount_add(truesize, &skb->sk->sk_wmem_alloc); + } + return ret; +} + +int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) +{ + struct sockaddr_storage address; + struct io_ring_ctx *ctx = req->ctx; + struct io_sendzc *zc = io_kiocb_to_cmd(req); + struct io_notif_slot *notif_slot; + struct io_kiocb *notif; + struct msghdr msg; + struct iovec iov; + struct socket *sock; + unsigned msg_flags; + int ret, min_ret = 0; + + if (!(req->flags & REQ_F_POLLED) && + (zc->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + + if (issue_flags & IO_URING_F_UNLOCKED) + return -EAGAIN; + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + notif_slot = io_get_notif_slot(ctx, zc->slot_idx); + if (!notif_slot) + return -EINVAL; + notif = io_get_notif(ctx, notif_slot); + if (!notif) + return -ENOMEM; + + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + + if (zc->flags & IORING_RECVSEND_FIXED_BUF) { + ret = io_import_fixed(WRITE, &msg.msg_iter, req->imu, + (u64)(uintptr_t)zc->buf, zc->len); + if (unlikely(ret)) + return ret; + } else { + ret = import_single_range(WRITE, zc->buf, zc->len, &iov, + &msg.msg_iter); + if (unlikely(ret)) + return ret; + ret = io_notif_account_mem(notif, zc->len); + if (unlikely(ret)) + return ret; + } + + if (zc->addr) { + ret = move_addr_to_kernel(zc->addr, zc->addr_len, &address); + if (unlikely(ret < 0)) + return ret; + msg.msg_name = (struct sockaddr *)&address; + msg.msg_namelen = zc->addr_len; + } + + msg_flags = zc->msg_flags | MSG_ZEROCOPY; + if (issue_flags & IO_URING_F_NONBLOCK) + msg_flags |= MSG_DONTWAIT; + if (msg_flags & MSG_WAITALL) + min_ret = iov_iter_count(&msg.msg_iter); + + msg.msg_flags = msg_flags; + msg.msg_ubuf = &io_notif_to_data(notif)->uarg; + msg.sg_from_iter = io_sg_from_iter; + ret = sock_sendmsg(sock, &msg); + + if (unlikely(ret < min_ret)) { + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) + return -EAGAIN; + return ret == -ERESTARTSYS ? -EINTR : ret; + } + + if (zc->flags & IORING_RECVSEND_NOTIF_FLUSH) + io_notif_slot_flush_submit(notif_slot, 0); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_accept *accept = io_kiocb_to_cmd(req); diff --git a/io_uring/net.h b/io_uring/net.h index db20ce9d6546..7c438d39c089 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -52,6 +52,9 @@ int io_connect_prep_async(struct io_kiocb *req); int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_connect(struct io_kiocb *req, unsigned int issue_flags); +int io_sendzc(struct io_kiocb *req, unsigned int issue_flags); +int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); + void io_netmsg_cache_free(struct io_cache_entry *entry); #else static inline void io_netmsg_cache_free(struct io_cache_entry *entry) diff --git a/io_uring/notif.c b/io_uring/notif.c new file mode 100644 index 000000000000..b5f989dff9de --- /dev/null +++ b/io_uring/notif.c @@ -0,0 +1,159 @@ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/file.h> +#include <linux/slab.h> +#include <linux/net.h> +#include <linux/io_uring.h> + +#include "io_uring.h" +#include "notif.h" +#include "rsrc.h" + +static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked) +{ + struct io_notif_data *nd = io_notif_to_data(notif); + struct io_ring_ctx *ctx = notif->ctx; + + if (nd->account_pages && ctx->user) { + __io_unaccount_mem(ctx->user, nd->account_pages); + nd->account_pages = 0; + } + io_req_task_complete(notif, locked); +} + +static inline void io_notif_complete(struct io_kiocb *notif) + __must_hold(¬if->ctx->uring_lock) +{ + bool locked = true; + + __io_notif_complete_tw(notif, &locked); +} + +static void io_uring_tx_zerocopy_callback(struct sk_buff *skb, + struct ubuf_info *uarg, + bool success) +{ + struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); + struct io_kiocb *notif = cmd_to_io_kiocb(nd); + + if (refcount_dec_and_test(&uarg->refcnt)) { + notif->io_task_work.func = __io_notif_complete_tw; + io_req_task_work_add(notif); + } +} + +struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx, + struct io_notif_slot *slot) + __must_hold(&ctx->uring_lock) +{ + struct io_kiocb *notif; + struct io_notif_data *nd; + + if (unlikely(!io_alloc_req_refill(ctx))) + return NULL; + notif = io_alloc_req(ctx); + notif->opcode = IORING_OP_NOP; + notif->flags = 0; + notif->file = NULL; + notif->task = current; + io_get_task_refs(1); + notif->rsrc_node = NULL; + io_req_set_rsrc_node(notif, ctx, 0); + notif->cqe.user_data = slot->tag; + notif->cqe.flags = slot->seq++; + notif->cqe.res = 0; + + nd = io_notif_to_data(notif); + nd->account_pages = 0; + nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; + nd->uarg.callback = io_uring_tx_zerocopy_callback; + /* master ref owned by io_notif_slot, will be dropped on flush */ + refcount_set(&nd->uarg.refcnt, 1); + return notif; +} + +void io_notif_slot_flush(struct io_notif_slot *slot) + __must_hold(&ctx->uring_lock) +{ + struct io_kiocb *notif = slot->notif; + struct io_notif_data *nd = io_notif_to_data(notif); + + slot->notif = NULL; + + /* drop slot's master ref */ + if (refcount_dec_and_test(&nd->uarg.refcnt)) + io_notif_complete(notif); +} + +__cold int io_notif_unregister(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + int i; + + if (!ctx->notif_slots) + return -ENXIO; + + for (i = 0; i < ctx->nr_notif_slots; i++) { + struct io_notif_slot *slot = &ctx->notif_slots[i]; + struct io_kiocb *notif = slot->notif; + struct io_notif_data *nd; + + if (!notif) + continue; + nd = io_kiocb_to_cmd(notif); + slot->notif = NULL; + if (!refcount_dec_and_test(&nd->uarg.refcnt)) + continue; + notif->io_task_work.func = __io_notif_complete_tw; + io_req_task_work_add(notif); + } + + kvfree(ctx->notif_slots); + ctx->notif_slots = NULL; + ctx->nr_notif_slots = 0; + return 0; +} + +__cold int io_notif_register(struct io_ring_ctx *ctx, + void __user *arg, unsigned int size) + __must_hold(&ctx->uring_lock) +{ + struct io_uring_notification_slot __user *slots; + struct io_uring_notification_slot slot; + struct io_uring_notification_register reg; + unsigned i; + + BUILD_BUG_ON(sizeof(struct io_notif_data) > 64); + + if (ctx->nr_notif_slots) + return -EBUSY; + if (size != sizeof(reg)) + return -EINVAL; + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + if (!reg.nr_slots || reg.nr_slots > IORING_MAX_NOTIF_SLOTS) + return -EINVAL; + if (reg.resv || reg.resv2 || reg.resv3) + return -EINVAL; + + slots = u64_to_user_ptr(reg.data); + ctx->notif_slots = kvcalloc(reg.nr_slots, sizeof(ctx->notif_slots[0]), + GFP_KERNEL_A |
