author    Linus Torvalds <torvalds@linux-foundation.org>  2025-04-03 15:48:58 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>  2025-04-03 15:48:58 -0700
commit    7930edcc3a7e2166477fbd99e62d2960f63cd9c9 (patch)
tree      f83acbafbd8729bffacf0fa2203d870bd352bc75
parent    c0dbd11ada2c94edc337a5f6665cbaa6079ff785 (diff)
parent    390513642ee6763c7ada07f0a1470474986e6c1c (diff)

Merge tag 'io_uring-6.15-20250403' of git://git.kernel.dk/linux

Pull more io_uring updates from Jens Axboe:
 "Set of fixes/updates for io_uring that should go into this release.
  The ublk bits could've gone via either tree - usually I put them in
  block, but they got a bit mixed this series with the zero-copy
  support that ended up dipping into both trees.

  This contains:

   - Fix for sendmsg zc, include in pinned pages accounting like we do
     for the other zc types

   - Series for ublk fixing request aborting, doing various little
     cleanups, fixing some zc issues, and adding queue_rqs support

   - Another ublk series doing some code cleanups

   - Series cleaning up the io_uring send path, mostly in preparation
     for registered buffers

   - Series doing little MSG_RING cleanups

   - Fix for the newly added zc rx, fixing len being 0 for the last
     invocation of the callback

   - Add vectored registered buffer support for ublk. With that, ublk
     also supports this feature in the kernel revision where it was
     generically introduced for rw/net

   - A bunch of selftest additions for ublk. This is the majority of
     the diffstat

   - Silence a KCSAN data race warning for io-wq

   - Various little cleanups and fixes"

* tag 'io_uring-6.15-20250403' of git://git.kernel.dk/linux: (44 commits)
  io_uring: always do atomic put from iowq
  selftests: ublk: enable zero copy for stripe target
  io_uring: support vectored kernel fixed buffer
  block: add for_each_mp_bvec()
  io_uring: add validate_fixed_range() for validate fixed buffer
  selftests: ublk: kublk: fix an error log line
  selftests: ublk: kublk: use ioctl-encoded opcodes
  io_uring/zcrx: return early from io_zcrx_recv_skb if readlen is 0
  io_uring/net: avoid import_ubuf for regvec send
  io_uring/rsrc: check size when importing reg buffer
  io_uring: cleanup {g,s]etsockopt sqe reading
  io_uring: hide caches sqes from drivers
  io_uring: make zcrx depend on CONFIG_IO_URING
  io_uring: add req flag invariant build assertion
  Documentation: ublk: remove dead footnote
  selftests: ublk: specify io_cmd_buf pointer type
  ublk: specify io_cmd_buf pointer type
  io_uring: don't pass ctx to tw add remote helper
  io_uring/msg: initialise msg request opcode
  io_uring/msg: rename io_double_lock_ctx()
  ...
-rw-r--r--  Documentation/block/ublk.rst                      37
-rw-r--r--  drivers/block/ublk_drv.c                         223
-rw-r--r--  include/linux/bvec.h                               6
-rw-r--r--  include/linux/io_uring/cmd.h                       1
-rw-r--r--  include/uapi/linux/ublk_cmd.h                     25
-rw-r--r--  io_uring/Kconfig                                   1
-rw-r--r--  io_uring/io_uring.c                               18
-rw-r--r--  io_uring/io_uring.h                                3
-rw-r--r--  io_uring/msg_ring.c                               11
-rw-r--r--  io_uring/net.c                                   135
-rw-r--r--  io_uring/refs.h                                    7
-rw-r--r--  io_uring/rsrc.c                                  126
-rw-r--r--  io_uring/uring_cmd.c                              22
-rw-r--r--  io_uring/uring_cmd.h                               1
-rw-r--r--  io_uring/zcrx.c                                    8
-rw-r--r--  tools/testing/selftests/ublk/Makefile              5
-rw-r--r--  tools/testing/selftests/ublk/kublk.c               8
-rw-r--r--  tools/testing/selftests/ublk/kublk.h               4
-rw-r--r--  tools/testing/selftests/ublk/null.c               11
-rw-r--r--  tools/testing/selftests/ublk/stripe.c             69
-rwxr-xr-x  tools/testing/selftests/ublk/test_common.sh        6
-rwxr-xr-x  tools/testing/selftests/ublk/test_generic_02.sh   44
-rwxr-xr-x  tools/testing/selftests/ublk/test_generic_03.sh   28
-rwxr-xr-x  tools/testing/selftests/ublk/test_loop_01.sh      14
-rwxr-xr-x  tools/testing/selftests/ublk/test_loop_03.sh      14
-rwxr-xr-x  tools/testing/selftests/ublk/test_loop_05.sh      28
-rwxr-xr-x  tools/testing/selftests/ublk/test_stress_01.sh     6
-rwxr-xr-x  tools/testing/selftests/ublk/test_stress_02.sh     6
-rwxr-xr-x  tools/testing/selftests/ublk/test_stripe_01.sh    14
-rwxr-xr-x  tools/testing/selftests/ublk/test_stripe_03.sh    30
30 files changed, 673 insertions, 238 deletions
diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst
index 1e0e7358e14a..854f823b46c2 100644
--- a/Documentation/block/ublk.rst
+++ b/Documentation/block/ublk.rst
@@ -309,18 +309,35 @@ with specified IO tag in the command data:
``UBLK_IO_COMMIT_AND_FETCH_REQ`` to the server, ublkdrv needs to copy
the server buffer (pages) read to the IO request pages.
-Future development
-==================
-
Zero copy
---------
-Zero copy is a generic requirement for nbd, fuse or similar drivers. A
-problem [#xiaoguang]_ Xiaoguang mentioned is that pages mapped to userspace
-can't be remapped any more in kernel with existing mm interfaces. This can
-occurs when destining direct IO to ``/dev/ublkb*``. Also, he reported that
-big requests (IO size >= 256 KB) may benefit a lot from zero copy.
-
+ublk zero copy relies on io_uring's fixed kernel buffer, which provides
+two APIs: `io_buffer_register_bvec()` and `io_buffer_unregister_bvec()`.
+
+ublk adds the IO command `UBLK_IO_REGISTER_IO_BUF`, which calls
+`io_buffer_register_bvec()` so that the ublk server can register a
+client request buffer in the io_uring buffer table; the server can then
+submit io_uring IOs with the registered buffer index. The IO command
+`UBLK_IO_UNREGISTER_IO_BUF` calls `io_buffer_unregister_bvec()` to
+unregister the buffer, which is guaranteed to stay live between the
+calls to `io_buffer_register_bvec()` and `io_buffer_unregister_bvec()`.
+Any io_uring operation which supports this kind of kernel buffer holds
+one reference on the buffer until the operation completes.
+
+A ublk server implementing zero copy or user copy has to have
+CAP_SYS_ADMIN and be trusted, because it is the ublk server's
+responsibility to fill the IO buffer with data when handling a READ
+command, and to return a result to the ublk driver that matches how
+many bytes were actually filled into the IO buffer. Otherwise, an
+uninitialized kernel IO buffer would be exposed to the client
+application.
+
+For zero copy to work correctly, the ublk server needs to align the
+`struct ublk_param_dma_align` parameter with its backend.
+
+For the best IO performance, the ublk server should align its
+`struct ublk_param_segment` parameters with the backend, to avoid
+unnecessary IO splits, which usually hurt io_uring performance.
References
==========
@@ -332,5 +349,3 @@ References
.. [#userspace_nbdublk] https://gitlab.com/rwmjones/libnbd/-/tree/nbdublk
.. [#userspace_readme] https://github.com/ming1/ubdsrv/blob/master/README
-
-.. [#xiaoguang] https://lore.kernel.org/linux-block/YoOr6jBfgVm8GvWg@stefanha-x1.localdomain/
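
To make the documented flow concrete, here is a minimal userspace sketch — illustrative, not code from this series — of a ublk server registering a client WRITE request's pages into the io_uring buffer table, writing them to a backend file as a fixed buffer, then unregistering. It assumes an IORING_SETUP_SQE128 ring, liburing, and the ioctl-encoded UBLK_U_IO_REGISTER_IO_BUF/UBLK_U_IO_UNREGISTER_IO_BUF opcodes from <linux/ublk_cmd.h>; the helper names, fds, and the link-chained three-step shape are assumptions for the example:

#include <string.h>
#include <liburing.h>
#include <linux/ublk_cmd.h>

/* Queue one ublk buffer-management uring_cmd (register or unregister).
 * q_id/tag name the client request; buf_idx is the io_uring buffer
 * table slot being (un)registered. */
static struct io_uring_sqe *ublk_buf_cmd(struct io_uring *ring, int ublk_ch_fd,
					 __u32 cmd_op, __u16 q_id, __u16 tag,
					 __u64 buf_idx)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd;

	memset(sqe, 0, 2 * sizeof(*sqe));	/* one SQE128 slot */
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd     = ublk_ch_fd;
	sqe->cmd_op = cmd_op;
	cmd->q_id   = q_id;
	cmd->tag    = tag;
	cmd->addr   = buf_idx;
	return sqe;
}

/* Handle one client WRITE with zero copy: register the request pages,
 * write them to the backend at the request offset, then unregister.
 * IOSQE_IO_LINK keeps the three steps ordered within one submission. */
static void handle_write_zc(struct io_uring *ring, int ublk_ch_fd,
			    int backend_fd, __u16 q_id, __u16 tag,
			    const struct ublksrv_io_desc *iod, __u64 buf_idx)
{
	struct io_uring_sqe *sqe;

	ublk_buf_cmd(ring, ublk_ch_fd, UBLK_U_IO_REGISTER_IO_BUF,
		     q_id, tag, buf_idx)->flags |= IOSQE_IO_LINK;

	sqe = io_uring_get_sqe(ring);
	/* for a kernel bvec buffer, the addr argument acts as an offset
	 * into the registered buffer, not a user pointer */
	io_uring_prep_write_fixed(sqe, backend_fd, NULL,
				  iod->nr_sectors << 9,
				  iod->start_sector << 9, buf_idx);
	sqe->flags |= IOSQE_IO_LINK;

	ublk_buf_cmd(ring, ublk_ch_fd, UBLK_U_IO_UNREGISTER_IO_BUF,
		     q_id, tag, buf_idx);
	io_uring_submit(ring);
}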
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index c060da409ed8..2fd05c1bd30b 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -74,13 +74,30 @@
#define UBLK_PARAM_TYPE_ALL \
(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \
- UBLK_PARAM_TYPE_DMA_ALIGN)
+ UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)
struct ublk_rq_data {
struct kref ref;
};
struct ublk_uring_cmd_pdu {
+	/*
+	 * Temporarily store requests in the same batch for queuing
+	 * them to the daemon context.
+	 *
+	 * This would normally be stored in the request payload, but we
+	 * want to avoid extra pre-allocation, and the uring_cmd payload
+	 * is always free for us.
+	 */
+ union {
+ struct request *req;
+ struct request *req_list;
+ };
+
+	/*
+	 * The following two fields are valid for this cmd's whole
+	 * lifetime, and are set up in the ublk uring_cmd handler.
+	 */
struct ublk_queue *ubq;
u16 tag;
};
@@ -141,10 +158,8 @@ struct ublk_queue {
unsigned long flags;
struct task_struct *ubq_daemon;
- char *io_cmd_buf;
+ struct ublksrv_io_desc *io_cmd_buf;
- unsigned long io_addr; /* mapped vm address */
- unsigned int max_io_sz;
bool force_abort;
bool timeout;
bool canceling;
@@ -582,6 +597,18 @@ static int ublk_validate_params(const struct ublk_device *ub)
return -EINVAL;
}
+ if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
+ const struct ublk_param_segment *p = &ub->params.seg;
+
+ if (!is_power_of_2(p->seg_boundary_mask + 1))
+ return -EINVAL;
+
+ if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE)
+ return -EINVAL;
+ if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE)
+ return -EINVAL;
+ }
+
return 0;
}
@@ -598,6 +625,11 @@ static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
return ubq->flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY);
}
+static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
+{
+ return !ublk_support_user_copy(ubq);
+}
+
static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
{
/*
@@ -674,11 +706,11 @@ static inline bool ublk_rq_has_data(const struct request *rq)
static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
int tag)
{
- return (struct ublksrv_io_desc *)
- &(ubq->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]);
+ return &ubq->io_cmd_buf[tag];
}
-static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
+static inline struct ublksrv_io_desc *
+ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
{
return ublk_get_queue(ub, q_id)->io_cmd_buf;
}
@@ -925,7 +957,7 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
{
const unsigned int rq_bytes = blk_rq_bytes(req);
- if (ublk_support_user_copy(ubq))
+ if (!ublk_need_map_io(ubq))
return rq_bytes;
/*
@@ -949,7 +981,7 @@ static int ublk_unmap_io(const struct ublk_queue *ubq,
{
const unsigned int rq_bytes = blk_rq_bytes(req);
- if (ublk_support_user_copy(ubq))
+ if (!ublk_need_map_io(ubq))
return rq_bytes;
if (ublk_need_unmap_req(req)) {
@@ -1037,7 +1069,7 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
struct io_uring_cmd *ioucmd)
{
- return (struct ublk_uring_cmd_pdu *)&ioucmd->pdu;
+ return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu);
}
static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq)
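
The conversion above replaces an open-coded cast with io_uring_cmd_to_pdu(), which build-asserts that the pdu type fits in the uring_cmd payload. A minimal sketch of the pattern as any driver might use it (hypothetical names):

/* Hypothetical driver-private per-command state, carried in the
 * uring_cmd payload area. */
struct my_cmd_pdu {
	struct request *req;
	u16 tag;
};

static struct my_cmd_pdu *my_get_pdu(struct io_uring_cmd *ioucmd)
{
	/* fails the build if struct my_cmd_pdu exceeds the pdu area */
	return io_uring_cmd_to_pdu(ioucmd, struct my_cmd_pdu);
}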
@@ -1155,14 +1187,11 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
blk_mq_end_request(rq, BLK_STS_IOERR);
}
-static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd,
- unsigned int issue_flags)
+static void ublk_dispatch_req(struct ublk_queue *ubq,
+ struct request *req,
+ unsigned int issue_flags)
{
- struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
- struct ublk_queue *ubq = pdu->ubq;
- int tag = pdu->tag;
- struct request *req = blk_mq_tag_to_rq(
- ubq->dev->tag_set.tags[ubq->q_id], tag);
+ int tag = req->tag;
struct ublk_io *io = &ubq->ios[tag];
unsigned int mapped_bytes;
@@ -1237,11 +1266,49 @@ static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd,
ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags);
}
+static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+ struct ublk_queue *ubq = pdu->ubq;
+
+ ublk_dispatch_req(ubq, pdu->req, issue_flags);
+}
+
static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
{
- struct ublk_io *io = &ubq->ios[rq->tag];
+ struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+
+ pdu->req = rq;
+ io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb);
+}
+
+static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+ struct request *rq = pdu->req_list;
+ struct ublk_queue *ubq = pdu->ubq;
+ struct request *next;
+
+ do {
+ next = rq->rq_next;
+ rq->rq_next = NULL;
+ ublk_dispatch_req(ubq, rq, issue_flags);
+ rq = next;
+ } while (rq);
+}
+
+static void ublk_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
+{
+ struct request *rq = rq_list_peek(l);
+ struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
- io_uring_cmd_complete_in_task(io->cmd, ublk_rq_task_work_cb);
+ pdu->req_list = rq;
+ rq_list_init(l);
+ io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb);
}
static enum blk_eh_timer_return ublk_timeout(struct request *rq)
@@ -1282,21 +1349,12 @@ static enum blk_eh_timer_return ublk_timeout(struct request *rq)
return BLK_EH_RESET_TIMER;
}
-static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
- const struct blk_mq_queue_data *bd)
+static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq)
{
- struct ublk_queue *ubq = hctx->driver_data;
- struct request *rq = bd->rq;
blk_status_t res;
- if (unlikely(ubq->fail_io)) {
+ if (unlikely(ubq->fail_io))
return BLK_STS_TARGET;
- }
-
- /* fill iod to slot in io cmd buffer */
- res = ublk_setup_iod(ubq, rq);
- if (unlikely(res != BLK_STS_OK))
- return BLK_STS_IOERR;
/* With recovery feature enabled, force_abort is set in
* ublk_stop_dev() before calling del_gendisk(). We have to
@@ -1310,17 +1368,68 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
if (ublk_nosrv_should_queue_io(ubq) && unlikely(ubq->force_abort))
return BLK_STS_IOERR;
+ if (unlikely(ubq->canceling))
+ return BLK_STS_IOERR;
+
+ /* fill iod to slot in io cmd buffer */
+ res = ublk_setup_iod(ubq, rq);
+ if (unlikely(res != BLK_STS_OK))
+ return BLK_STS_IOERR;
+
+ blk_mq_start_request(rq);
+ return BLK_STS_OK;
+}
+
+static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
+{
+ struct ublk_queue *ubq = hctx->driver_data;
+ struct request *rq = bd->rq;
+ blk_status_t res;
+
+ res = ublk_prep_req(ubq, rq);
+ if (res != BLK_STS_OK)
+ return res;
+
+	/*
+	 * ->canceling has to be handled after ->force_abort and ->fail_io
+	 * are dealt with, otherwise this request may not be failed during
+	 * recovery, causing a hang when deleting the disk.
+	 */
if (unlikely(ubq->canceling)) {
__ublk_abort_rq(ubq, rq);
return BLK_STS_OK;
}
- blk_mq_start_request(bd->rq);
ublk_queue_cmd(ubq, rq);
-
return BLK_STS_OK;
}
+static void ublk_queue_rqs(struct rq_list *rqlist)
+{
+ struct rq_list requeue_list = { };
+ struct rq_list submit_list = { };
+ struct ublk_queue *ubq = NULL;
+ struct request *req;
+
+ while ((req = rq_list_pop(rqlist))) {
+ struct ublk_queue *this_q = req->mq_hctx->driver_data;
+
+ if (ubq && ubq != this_q && !rq_list_empty(&submit_list))
+ ublk_queue_cmd_list(ubq, &submit_list);
+ ubq = this_q;
+
+ if (ublk_prep_req(ubq, req) == BLK_STS_OK)
+ rq_list_add_tail(&submit_list, req);
+ else
+ rq_list_add_tail(&requeue_list, req);
+ }
+
+ if (ubq && !rq_list_empty(&submit_list))
+ ublk_queue_cmd_list(ubq, &submit_list);
+ *rqlist = requeue_list;
+}
+
static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
unsigned int hctx_idx)
{
@@ -1333,6 +1442,7 @@ static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
static const struct blk_mq_ops ublk_mq_ops = {
.queue_rq = ublk_queue_rq,
+ .queue_rqs = ublk_queue_rqs,
.init_hctx = ublk_init_hctx,
.timeout = ublk_timeout,
};
@@ -1446,17 +1556,27 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
}
}
+/* Must be called when queue is frozen */
+static bool ublk_mark_queue_canceling(struct ublk_queue *ubq)
+{
+ bool canceled;
+
+ spin_lock(&ubq->cancel_lock);
+ canceled = ubq->canceling;
+ if (!canceled)
+ ubq->canceling = true;
+ spin_unlock(&ubq->cancel_lock);
+
+ return canceled;
+}
+
static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq)
{
+ bool was_canceled = ubq->canceling;
struct gendisk *disk;
- spin_lock(&ubq->cancel_lock);
- if (ubq->canceling) {
- spin_unlock(&ubq->cancel_lock);
+ if (was_canceled)
return false;
- }
- ubq->canceling = true;
- spin_unlock(&ubq->cancel_lock);
spin_lock(&ub->lock);
disk = ub->ub_disk;
@@ -1468,14 +1588,23 @@ static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq)
if (!disk)
return false;
- /* Now we are serialized with ublk_queue_rq() */
+	/*
+	 * Now we are serialized with ublk_queue_rq()
+	 *
+	 * Make sure that ubq->canceling is set while the queue is frozen,
+	 * because ublk_queue_rq() has to rely on this flag to avoid
+	 * touching a completed uring_cmd.
+	 */
blk_mq_quiesce_queue(disk->queue);
- /* abort queue is for making forward progress */
- ublk_abort_queue(ub, ubq);
+ was_canceled = ublk_mark_queue_canceling(ubq);
+ if (!was_canceled) {
+ /* abort queue is for making forward progress */
+ ublk_abort_queue(ub, ubq);
+ }
blk_mq_unquiesce_queue(disk->queue);
put_device(disk_to_dev(disk));
- return true;
+ return !was_canceled;
}
static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io,
@@ -1845,7 +1974,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
goto out;
- if (!ublk_support_user_copy(ubq)) {
+ if (ublk_need_map_io(ubq)) {
/*
* FETCH_RQ has to provide IO buffer if NEED GET
* DATA is not enabled
@@ -1867,7 +1996,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
goto out;
- if (!ublk_support_user_copy(ubq)) {
+ if (ublk_need_map_io(ubq)) {
/*
* COMMIT_AND_FETCH_REQ has to provide IO buffer if
* NEED GET DATA is not enabled or it is Read IO.
@@ -2343,6 +2472,12 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN)
lim.dma_alignment = ub->params.dma.alignment;
+ if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
+ lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask;
+ lim.max_segment_size = ub->params.seg.max_segment_size;
+ lim.max_segments = ub->params.seg.max_segments;
+ }
+
if (wait_for_completion_interruptible(&ub->completion) != 0)
return -EINTR;
diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index ba8f52d48b94..204b22a99c4b 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -184,6 +184,12 @@ static inline void bvec_iter_advance_single(const struct bio_vec *bv,
((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \
bvec_iter_advance_single((bio_vec), &(iter), (bvl).bv_len))
+#define for_each_mp_bvec(bvl, bio_vec, iter, start) \
+ for (iter = (start); \
+ (iter).bi_size && \
+ ((bvl = mp_bvec_iter_bvec((bio_vec), (iter))), 1); \
+ bvec_iter_advance_single((bio_vec), &(iter), (bvl).bv_len))
+
/* for iterating one bio from start to end */
#define BVEC_ITER_ALL_INIT (struct bvec_iter) \
{ \
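
Unlike for_each_bvec() just above it, the new helper yields whole multi-page segments via mp_bvec_iter_bvec(). A minimal kernel-side sketch of its use (illustrative function, not from this series):

/* Count a bio's segments without splitting multi-page bvecs into
 * single-page vectors; bv.bv_len may exceed PAGE_SIZE here. */
static unsigned int count_mp_segments(struct bio *bio)
{
	struct bvec_iter iter;
	struct bio_vec bv;
	unsigned int segs = 0;

	for_each_mp_bvec(bv, bio->bi_io_vec, iter, bio->bi_iter)
		segs++;

	return segs;
}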
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index e6723fa95160..0634a3de1782 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -21,7 +21,6 @@ struct io_uring_cmd {
struct io_uring_cmd_data {
void *op_data;
- struct io_uring_sqe sqes[2];
};
static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 7255b36b5cf6..583b86681c93 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -410,6 +410,29 @@ struct ublk_param_dma_align {
__u8 pad[4];
};
+#define UBLK_MIN_SEGMENT_SIZE 4096
+/*
+ * If any one of the three segment parameters is set to 0, the behavior
+ * is undefined.
+ */
+struct ublk_param_segment {
+ /*
+	 * seg_boundary_mask + 1 needs to be a power of 2, and the sum has
+	 * to be >= UBLK_MIN_SEGMENT_SIZE (4096)
+ */
+ __u64 seg_boundary_mask;
+
+ /*
+	 * max_segment_size could be overridden by virt_boundary_mask, so
+	 * be careful when setting both.
+	 *
+	 * max_segment_size has to be >= UBLK_MIN_SEGMENT_SIZE (4096)
+ */
+ __u32 max_segment_size;
+ __u16 max_segments;
+ __u8 pad[2];
+};
+
struct ublk_params {
/*
* Total length of parameters, userspace has to set 'len' for both
@@ -423,6 +446,7 @@ struct ublk_params {
#define UBLK_PARAM_TYPE_DEVT (1 << 2)
#define UBLK_PARAM_TYPE_ZONED (1 << 3)
#define UBLK_PARAM_TYPE_DMA_ALIGN (1 << 4)
+#define UBLK_PARAM_TYPE_SEGMENT (1 << 5)
__u32 types; /* types of parameter included */
struct ublk_param_basic basic;
@@ -430,6 +454,7 @@ struct ublk_params {
struct ublk_param_devt devt;
struct ublk_param_zoned zoned;
struct ublk_param_dma_align dma;
+ struct ublk_param_segment seg;
};
#endif
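
To illustrate the constraints spelled out in the comments above (mask + 1 a power of 2, both sizes at least UBLK_MIN_SEGMENT_SIZE, no parameter zero), a ublk server might fill the new parameters like this; the values are illustrative, not mandated:

#include <linux/ublk_cmd.h>

/* Advertise segment limits that match a hypothetical backend; the
 * values below satisfy the ublk_validate_params() checks added in
 * this merge. */
static void set_seg_params(struct ublk_params *p)
{
	p->types |= UBLK_PARAM_TYPE_SEGMENT;
	p->seg = (struct ublk_param_segment) {
		.seg_boundary_mask = 4095,	/* mask + 1 == 4096, a power of 2 */
		.max_segment_size  = 256 << 10,	/* >= UBLK_MIN_SEGMENT_SIZE */
		.max_segments      = 128,	/* must not be 0 */
	};
}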
diff --git a/io_uring/Kconfig b/io_uring/Kconfig
index 9e2a4beba1ef..4b949c42c0bf 100644
--- a/io_uring/Kconfig
+++ b/io_uring/Kconfig
@@ -5,6 +5,7 @@
config IO_URING_ZCRX
def_bool y
+ depends on IO_URING
depends on PAGE_POOL
depends on INET
depends on NET_RX_BUSY_POLL
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 3ba49c628337..c6209fe44cb1 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1141,10 +1141,9 @@ void tctx_task_work(struct callback_head *cb)
WARN_ON_ONCE(ret);
}
-static inline void io_req_local_work_add(struct io_kiocb *req,
- struct io_ring_ctx *ctx,
- unsigned flags)
+static void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
{
+ struct io_ring_ctx *ctx = req->ctx;
unsigned nr_wait, nr_tw, nr_tw_prev;
struct llist_node *head;
@@ -1239,17 +1238,16 @@ static void io_req_normal_work_add(struct io_kiocb *req)
void __io_req_task_work_add(struct io_kiocb *req, unsigned flags)
{
if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN)
- io_req_local_work_add(req, req->ctx, flags);
+ io_req_local_work_add(req, flags);
else
io_req_normal_work_add(req);
}
-void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx,
- unsigned flags)
+void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags)
{
- if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)))
+ if (WARN_ON_ONCE(!(req->ctx->flags & IORING_SETUP_DEFER_TASKRUN)))
return;
- io_req_local_work_add(req, ctx, flags);
+ __io_req_task_work_add(req, flags);
}
static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
@@ -1645,6 +1643,8 @@ io_req_flags_t io_file_get_flags(struct file *file)
{
io_req_flags_t res = 0;
+ BUILD_BUG_ON(REQ_F_ISREG_BIT != REQ_F_SUPPORT_NOWAIT_BIT + 1);
+
if (S_ISREG(file_inode(file)->i_mode))
res |= REQ_F_ISREG;
if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT))
@@ -1796,7 +1796,7 @@ struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
struct io_kiocb *nxt = NULL;
- if (req_ref_put_and_test(req)) {
+ if (req_ref_put_and_test_atomic(req)) {
if (req->flags & IO_REQ_LINK_FLAGS)
nxt = io_req_find_next(req);
io_free_req(req);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 87f883130286..e4050b2d0821 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -89,8 +89,7 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
unsigned issue_flags);
void __io_req_task_work_add(struct io_kiocb *req, unsigned flags);
-void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx,
- unsigned flags);
+void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags);
void io_req_task_queue(struct io_kiocb *req);
void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw);
void io_req_task_queue_fail(struct io_kiocb *req, int ret);
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 0bbcbbcdebfd..50a958e9c921 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -38,8 +38,8 @@ static void io_double_unlock_ctx(struct io_ring_ctx *octx)
mutex_unlock(&octx->uring_lock);
}
-static int io_double_lock_ctx(struct io_ring_ctx *octx,
- unsigned int issue_flags)
+static int io_lock_external_ctx(struct io_ring_ctx *octx,
+ unsigned int issue_flags)
{
/*
* To ensure proper ordering between the two ctxs, we can only
@@ -93,13 +93,14 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req,
kmem_cache_free(req_cachep, req);
return -EOWNERDEAD;
}
+ req->opcode = IORING_OP_NOP;
req->cqe.user_data = user_data;
io_req_set_res(req, res, cflags);
percpu_ref_get(&ctx->refs);
req->ctx = ctx;
req->tctx = NULL;
req->io_task_work.func = io_msg_tw_complete;
- io_req_task_work_add_remote(req, ctx, IOU_F_TWQ_LAZY_WAKE);
+ io_req_task_work_add_remote(req, IOU_F_TWQ_LAZY_WAKE);
return 0;
}
@@ -154,7 +155,7 @@ static int __io_msg_ring_data(struct io_ring_ctx *target_ctx,
ret = -EOVERFLOW;
if (target_ctx->flags & IORING_SETUP_IOPOLL) {
- if (unlikely(io_double_lock_ctx(target_ctx, issue_flags)))
+ if (unlikely(io_lock_external_ctx(target_ctx, issue_flags)))
return -EAGAIN;
}
if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags))
@@ -199,7 +200,7 @@ static int io_msg_install_complete(struct io_kiocb *req, unsigned int issue_flag
struct file *src_file = msg->src_file;
int ret;
- if (unlikely(io_double_lock_ctx(target_ctx, issue_flags)))
+ if (unlikely(io_lock_external_ctx(target_ctx, issue_flags)))
return -EAGAIN;
ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd);
diff --git a/io_uring/net.c b/io_uring/net.c
index 8944eb679024..24040bc3916a 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -97,6 +97,11 @@ struct io_recvzc {
struct io_zcrx_ifq *ifq;
};
+static int io_sg_from_iter_iovec(struct sk_buff *skb,
+ struct iov_iter *from, size_t length);
+static int io_sg_from_iter(struct sk_buff *skb,
+ struct iov_iter *from, size_t length);
+
int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
@@ -176,16 +181,6 @@ static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
return hdr;
}
-/* assign new iovec to kmsg, if we need to */
-static void io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg,
- struct iovec *iov)
-{
- if (iov) {
- req->flags |= REQ_F_NEED_CLEANUP;
- io_vec_reset_iovec(&kmsg->vec, iov, kmsg->msg.msg_iter.nr_segs);
- }
-}
-
static inline void io_mshot_prep_retry(struct io_kiocb *req,
struct io_async_msghdr *kmsg)
{
@@ -217,7 +212,11 @@ static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg
&iomsg->msg.msg_iter, io_is_compat(req->ctx));
if (unlikely(ret < 0))
return ret;
- io_net_vec_assign(req, iomsg, iov);
+
+ if (iov) {
+ req->flags |= REQ_F_NEED_CLEANUP;
+ io_vec_reset_iovec(&iomsg->vec, iov, iomsg->msg.msg_iter.nr_segs);
+ }
return 0;
}
@@ -325,25 +324,6 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
return 0;
}
-static int io_sendmsg_copy_hdr(struct io_kiocb *req,
- struct io_async_msghdr *iomsg)
-{
- struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct user_msghdr msg;
- int ret;
-
- ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE, NULL);
- if (unlikely(ret))
- return ret;
-
- if (!(req->flags & REQ_F_BUFFER_SELECT))
- ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen,
- ITER_SOURCE);
- /* save msg_control as sys_sendmsg() overwrites it */
- sr->msg_control = iomsg->msg.msg_control_user;
- return ret;
-}
-
void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
{
struct io_async_msghdr *io = req->async_data;
@@ -379,6 +359,8 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
kmsg->msg.msg_name = &kmsg->addr;
kmsg->msg.msg_namelen = addr_len;
}
+ if (sr->flags & IORING_RECVSEND_FIXED_BUF)
+ return 0;
if (!io_do_buffer_select(req)) {
ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
&kmsg->msg.msg_iter);
@@ -392,31 +374,24 @@ static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct io_async_msghdr *kmsg = req->async_data;
-
- sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
-
- return io_sendmsg_copy_hdr(req, kmsg);
-}
-
-static int io_sendmsg_zc_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct io_async_msghdr *kmsg = req->async_data;
struct user_msghdr msg;
int ret;
- if (!(sr->flags & IORING_RECVSEND_FIXED_BUF))
- return io_sendmsg_setup(req, sqe);
-