diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-02-09 17:57:21 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-02-09 17:57:21 -0800 |
| commit | 0c00ed308d0559fc216be0442a3df124e9e13533 (patch) | |
| tree | a41c8509b8543ce8681d0aa9c06a9f94c2b6e458 /drivers/block | |
| parent | 591beb0e3a03258ef9c01893a5209845799a7c33 (diff) | |
| parent | 72f4d6fca699a1e35b39c5e5dacac2926d254135 (diff) | |
| download | linux-0c00ed308d0559fc216be0442a3df124e9e13533.tar.gz linux-0c00ed308d0559fc216be0442a3df124e9e13533.tar.bz2 linux-0c00ed308d0559fc216be0442a3df124e9e13533.zip | |
Merge tag 'for-7.0/block-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull block updates from Jens Axboe:
- Support for batch request processing for ublk, improving the
efficiency of the kernel/ublk server communication. This can yield
nice 7-12% performance improvements
- Support for integrity data for ublk
- Various other ublk improvements and additions, including a ton of
selftests additions and updated
- Move the handling of blk-crypto software fallback from below the
block layer to above it. This reduces the complexity of dealing with
bio splitting
- Series fixing a number of potential deadlocks in blk-mq related to
the queue usage counter and writeback throttling and rq-qos debugfs
handling
- Add an async_depth queue attribute, to resolve a performance
regression that's been around for a qhilw related to the scheduler
depth handling
- Only use task_work for IOPOLL completions on NVMe, if it is necessary
to do so. An earlier fix for an issue resulted in all these
completions being punted to task_work, to guarantee that completions
were only run for a given io_uring ring when it was local to that
ring. With the new changes, we can detect if it's necessary to use
task_work or not, and avoid it if possible.
- rnbd fixes:
- Fix refcount underflow in device unmap path
- Handle PREFLUSH and NOUNMAP flags properly in protocol
- Fix server-side bi_size for special IOs
- Zero response buffer before use
- Fix trace format for flags
- Add .release to rnbd_dev_ktype
- MD pull requests via Yu Kuai
- Fix raid5_run() to return error when log_init() fails
- Fix IO hang with degraded array with llbitmap
- Fix percpu_ref not resurrected on suspend timeout in llbitmap
- Fix GPF in write_page caused by resize race
- Fix NULL pointer dereference in process_metadata_update
- Fix hang when stopping arrays with metadata through dm-raid
- Fix any_working flag handling in raid10_sync_request
- Refactor sync/recovery code path, improve error handling for
badblocks, and remove unused recovery_disabled field
- Consolidate mddev boolean fields into mddev_flags
- Use mempool to allocate stripe_request_ctx and make sure
max_sectors is not less than io_opt in raid5
- Fix return value of mddev_trylock
- Fix memory leak in raid1_run()
- Add Li Nan as mdraid reviewer
- Move phys_vec definitions to the kernel types, mostly in preparation
for some VFIO and RDMA changes
- Improve the speed for secure erase for some devices
- Various little rust updates
- Various other minor fixes, improvements, and cleanups
* tag 'for-7.0/block-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (162 commits)
blk-mq: ABI/sysfs-block: fix docs build warnings
selftests: ublk: organize test directories by test ID
block: decouple secure erase size limit from discard size limit
block: remove redundant kill_bdev() call in set_blocksize()
blk-mq: add documentation for new queue attribute async_dpeth
block, bfq: convert to use request_queue->async_depth
mq-deadline: covert to use request_queue->async_depth
kyber: covert to use request_queue->async_depth
blk-mq: add a new queue sysfs attribute async_depth
blk-mq: factor out a helper blk_mq_limit_depth()
blk-mq-sched: unify elevators checking for async requests
block: convert nr_requests to unsigned int
block: don't use strcpy to copy blockdev name
blk-mq-debugfs: warn about possible deadlock
blk-mq-debugfs: add missing debugfs_mutex in blk_mq_debugfs_register_hctxs()
blk-mq-debugfs: remove blk_mq_debugfs_unregister_rqos()
blk-mq-debugfs: make blk_mq_debugfs_register_rqos() static
blk-rq-qos: fix possible debugfs_mutex deadlock
blk-mq-debugfs: factor out a helper to register debugfs for all rq_qos
blk-wbt: fix possible deadlock to nest pcpu_alloc_mutex under q_usage_counter
...
Diffstat (limited to 'drivers/block')
| -rw-r--r-- | drivers/block/brd.c | 3 | ||||
| -rw-r--r-- | drivers/block/loop.c | 2 | ||||
| -rw-r--r-- | drivers/block/null_blk/main.c | 4 | ||||
| -rw-r--r-- | drivers/block/rnbd/rnbd-clt-sysfs.c | 8 | ||||
| -rw-r--r-- | drivers/block/rnbd/rnbd-clt.c | 19 | ||||
| -rw-r--r-- | drivers/block/rnbd/rnbd-proto.h | 18 | ||||
| -rw-r--r-- | drivers/block/rnbd/rnbd-srv-trace.h | 22 | ||||
| -rw-r--r-- | drivers/block/rnbd/rnbd-srv.c | 36 | ||||
| -rw-r--r-- | drivers/block/rnull/configfs.rs | 3 | ||||
| -rw-r--r-- | drivers/block/rnull/rnull.rs | 3 | ||||
| -rw-r--r-- | drivers/block/ublk_drv.c | 1905 |
11 files changed, 1767 insertions, 256 deletions
diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 9778259b30d4..a5104cf96609 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -247,8 +247,7 @@ MODULE_ALIAS("rd"); /* Legacy boot options - nonmodular */ static int __init ramdisk_size(char *str) { - rd_size = simple_strtol(str, NULL, 0); - return 1; + return kstrtoul(str, 0, &rd_size) == 0; } __setup("ramdisk_size=", ramdisk_size); #endif diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 32a3a5b13802..98789a5297f2 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -969,7 +969,7 @@ static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim, lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL); if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY)) lim->features |= BLK_FEAT_WRITE_CACHE; - if (backing_bdev && !bdev_nonrot(backing_bdev)) + if (backing_bdev && bdev_rot(backing_bdev)) lim->features |= BLK_FEAT_ROTATIONAL; lim->max_hw_discard_sectors = max_discard_sectors; lim->max_write_zeroes_sectors = max_discard_sectors; diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 4c0632ab4e1b..740a8ac42075 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -642,7 +642,7 @@ static void nullb_device_release(struct config_item *item) null_free_dev(dev); } -static struct configfs_item_operations nullb_device_ops = { +static const struct configfs_item_operations nullb_device_ops = { .release = nullb_device_release, }; @@ -749,7 +749,7 @@ static struct configfs_attribute *nullb_group_attrs[] = { NULL, }; -static struct configfs_group_operations nullb_group_ops = { +static const struct configfs_group_operations nullb_group_ops = { .make_group = nullb_group_make_group, .drop_item = nullb_group_drop_item, }; diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index 6ea7c12e3a87..144aea1466a4 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -475,9 +475,17 @@ void rnbd_clt_remove_dev_symlink(struct rnbd_clt_dev *dev) } } +static void rnbd_dev_release(struct kobject *kobj) +{ + struct rnbd_clt_dev *dev = container_of(kobj, struct rnbd_clt_dev, kobj); + + kfree(dev); +} + static const struct kobj_type rnbd_dev_ktype = { .sysfs_ops = &kobj_sysfs_ops, .default_groups = rnbd_dev_groups, + .release = rnbd_dev_release, }; static int rnbd_clt_add_dev_kobj(struct rnbd_clt_dev *dev) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index d1c354636315..757df2896aeb 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -60,7 +60,9 @@ static void rnbd_clt_put_dev(struct rnbd_clt_dev *dev) kfree(dev->pathname); rnbd_clt_put_sess(dev->sess); mutex_destroy(&dev->lock); - kfree(dev); + + if (dev->kobj.state_initialized) + kobject_put(&dev->kobj); } static inline bool rnbd_clt_get_dev(struct rnbd_clt_dev *dev) @@ -1517,7 +1519,7 @@ static bool insert_dev_if_not_exists_devpath(struct rnbd_clt_dev *dev) return found; } -static void delete_dev(struct rnbd_clt_dev *dev) +static void rnbd_delete_dev(struct rnbd_clt_dev *dev) { struct rnbd_clt_session *sess = dev->sess; @@ -1638,7 +1640,7 @@ put_iu: kfree(rsp); rnbd_put_iu(sess, iu); del_dev: - delete_dev(dev); + rnbd_delete_dev(dev); put_dev: rnbd_clt_put_dev(dev); put_sess: @@ -1647,13 +1649,13 @@ put_sess: return ERR_PTR(ret); } -static void destroy_gen_disk(struct rnbd_clt_dev *dev) +static void rnbd_destroy_gen_disk(struct rnbd_clt_dev *dev) { del_gendisk(dev->gd); put_disk(dev->gd); } -static void destroy_sysfs(struct rnbd_clt_dev *dev, +static void rnbd_destroy_sysfs(struct rnbd_clt_dev *dev, const struct attribute *sysfs_self) { rnbd_clt_remove_dev_symlink(dev); @@ -1662,7 +1664,6 @@ static void destroy_sysfs(struct rnbd_clt_dev *dev, /* To avoid deadlock firstly remove itself */ sysfs_remove_file_self(&dev->kobj, sysfs_self); kobject_del(&dev->kobj); - kobject_put(&dev->kobj); } } @@ -1691,9 +1692,9 @@ int rnbd_clt_unmap_device(struct rnbd_clt_dev *dev, bool force, dev->dev_state = DEV_STATE_UNMAPPED; mutex_unlock(&dev->lock); - delete_dev(dev); - destroy_sysfs(dev, sysfs_self); - destroy_gen_disk(dev); + rnbd_delete_dev(dev); + rnbd_destroy_sysfs(dev, sysfs_self); + rnbd_destroy_gen_disk(dev); if (was_mapped && sess->rtrs) send_msg_close(dev, dev->device_id, RTRS_PERMIT_WAIT); diff --git a/drivers/block/rnbd/rnbd-proto.h b/drivers/block/rnbd/rnbd-proto.h index 77360c2a6069..64f1cfe9f8ef 100644 --- a/drivers/block/rnbd/rnbd-proto.h +++ b/drivers/block/rnbd/rnbd-proto.h @@ -18,7 +18,7 @@ #include <rdma/ib.h> #define RNBD_PROTO_VER_MAJOR 2 -#define RNBD_PROTO_VER_MINOR 0 +#define RNBD_PROTO_VER_MINOR 2 /* The default port number the RTRS server is listening on. */ #define RTRS_PORT 1234 @@ -197,6 +197,8 @@ struct rnbd_msg_io { * * @RNBD_F_SYNC: request is sync (sync write or read) * @RNBD_F_FUA: forced unit access + * @RNBD_F_PREFLUSH: request for cache flush + * @RNBD_F_NOUNMAP: do not free blocks when zeroing */ enum rnbd_io_flags { @@ -211,6 +213,8 @@ enum rnbd_io_flags { /* Flags */ RNBD_F_SYNC = 1<<(RNBD_OP_BITS + 0), RNBD_F_FUA = 1<<(RNBD_OP_BITS + 1), + RNBD_F_PREFLUSH = 1<<(RNBD_OP_BITS + 2), + RNBD_F_NOUNMAP = 1<<(RNBD_OP_BITS + 3) }; static inline u32 rnbd_op(u32 flags) @@ -245,6 +249,9 @@ static inline blk_opf_t rnbd_to_bio_flags(u32 rnbd_opf) break; case RNBD_OP_WRITE_ZEROES: bio_opf = REQ_OP_WRITE_ZEROES; + + if (rnbd_opf & RNBD_F_NOUNMAP) + bio_opf |= REQ_NOUNMAP; break; default: WARN(1, "Unknown RNBD type: %d (flags %d)\n", @@ -258,6 +265,9 @@ static inline blk_opf_t rnbd_to_bio_flags(u32 rnbd_opf) if (rnbd_opf & RNBD_F_FUA) bio_opf |= REQ_FUA; + if (rnbd_opf & RNBD_F_PREFLUSH) + bio_opf |= REQ_PREFLUSH; + return bio_opf; } @@ -280,6 +290,9 @@ static inline u32 rq_to_rnbd_flags(struct request *rq) break; case REQ_OP_WRITE_ZEROES: rnbd_opf = RNBD_OP_WRITE_ZEROES; + + if (rq->cmd_flags & REQ_NOUNMAP) + rnbd_opf |= RNBD_F_NOUNMAP; break; case REQ_OP_FLUSH: rnbd_opf = RNBD_OP_FLUSH; @@ -297,6 +310,9 @@ static inline u32 rq_to_rnbd_flags(struct request *rq) if (op_is_flush(rq->cmd_flags)) rnbd_opf |= RNBD_F_FUA; + if (rq->cmd_flags & REQ_PREFLUSH) + rnbd_opf |= RNBD_F_PREFLUSH; + return rnbd_opf; } diff --git a/drivers/block/rnbd/rnbd-srv-trace.h b/drivers/block/rnbd/rnbd-srv-trace.h index 89d0bcb17195..18ae2ed5537a 100644 --- a/drivers/block/rnbd/rnbd-srv-trace.h +++ b/drivers/block/rnbd/rnbd-srv-trace.h @@ -44,24 +44,6 @@ DEFINE_EVENT(rnbd_srv_link_class, name, \ DEFINE_LINK_EVENT(create_sess); DEFINE_LINK_EVENT(destroy_sess); -TRACE_DEFINE_ENUM(RNBD_OP_READ); -TRACE_DEFINE_ENUM(RNBD_OP_WRITE); -TRACE_DEFINE_ENUM(RNBD_OP_FLUSH); -TRACE_DEFINE_ENUM(RNBD_OP_DISCARD); -TRACE_DEFINE_ENUM(RNBD_OP_SECURE_ERASE); -TRACE_DEFINE_ENUM(RNBD_F_SYNC); -TRACE_DEFINE_ENUM(RNBD_F_FUA); - -#define show_rnbd_rw_flags(x) \ - __print_flags(x, "|", \ - { RNBD_OP_READ, "READ" }, \ - { RNBD_OP_WRITE, "WRITE" }, \ - { RNBD_OP_FLUSH, "FLUSH" }, \ - { RNBD_OP_DISCARD, "DISCARD" }, \ - { RNBD_OP_SECURE_ERASE, "SECURE_ERASE" }, \ - { RNBD_F_SYNC, "SYNC" }, \ - { RNBD_F_FUA, "FUA" }) - TRACE_EVENT(process_rdma, TP_PROTO(struct rnbd_srv_session *srv, const struct rnbd_msg_io *msg, @@ -97,7 +79,7 @@ TRACE_EVENT(process_rdma, __entry->usrlen = usrlen; ), - TP_printk("I/O req: sess: %s, type: %s, ver: %d, devid: %u, sector: %llu, bsize: %u, flags: %s, ioprio: %d, datalen: %u, usrlen: %zu", + TP_printk("I/O req: sess: %s, type: %s, ver: %d, devid: %u, sector: %llu, bsize: %u, flags: %u, ioprio: %d, datalen: %u, usrlen: %zu", __get_str(sessname), __print_symbolic(__entry->dir, { READ, "READ" }, @@ -106,7 +88,7 @@ TRACE_EVENT(process_rdma, __entry->device_id, __entry->sector, __entry->bi_size, - show_rnbd_rw_flags(__entry->flags), + __entry->flags, __entry->ioprio, __entry->datalen, __entry->usrlen diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 2df8941a6b14..7eeb321d6140 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -145,18 +145,30 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, priv->sess_dev = sess_dev; priv->id = id; - bio = bio_alloc(file_bdev(sess_dev->bdev_file), 1, + bio = bio_alloc(file_bdev(sess_dev->bdev_file), !!datalen, rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL); - bio_add_virt_nofail(bio, data, datalen); - - bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw)); - if (bio_has_data(bio) && - bio->bi_iter.bi_size != le32_to_cpu(msg->bi_size)) { - rnbd_srv_err_rl(sess_dev, "Datalen mismatch: bio bi_size (%u), bi_size (%u)\n", - bio->bi_iter.bi_size, msg->bi_size); - err = -EINVAL; - goto bio_put; + if (unlikely(!bio)) { + err = -ENOMEM; + goto put_sess_dev; } + + if (!datalen) { + /* + * For special requests like DISCARD and WRITE_ZEROES, the datalen is zero. + */ + bio->bi_iter.bi_size = le32_to_cpu(msg->bi_size); + } else { + bio_add_virt_nofail(bio, data, datalen); + bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw)); + if (bio->bi_iter.bi_size != le32_to_cpu(msg->bi_size)) { + rnbd_srv_err_rl(sess_dev, + "Datalen mismatch: bio bi_size (%u), bi_size (%u)\n", + bio->bi_iter.bi_size, msg->bi_size); + err = -EINVAL; + goto bio_put; + } + } + bio->bi_end_io = rnbd_dev_bi_end_io; bio->bi_private = priv; bio->bi_iter.bi_sector = le64_to_cpu(msg->sector); @@ -170,6 +182,7 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, bio_put: bio_put(bio); +put_sess_dev: rnbd_put_sess_dev(sess_dev); err: kfree(priv); @@ -538,6 +551,8 @@ static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp, { struct block_device *bdev = file_bdev(sess_dev->bdev_file); + memset(rsp, 0, sizeof(*rsp)); + rsp->hdr.type = cpu_to_le16(RNBD_MSG_OPEN_RSP); rsp->device_id = cpu_to_le32(sess_dev->device_id); rsp->nsectors = cpu_to_le64(bdev_nr_sectors(bdev)); @@ -644,6 +659,7 @@ static void process_msg_sess_info(struct rnbd_srv_session *srv_sess, trace_process_msg_sess_info(srv_sess, sess_info_msg); + memset(rsp, 0, sizeof(*rsp)); rsp->hdr.type = cpu_to_le16(RNBD_MSG_SESS_INFO_RSP); rsp->ver = srv_sess->ver; } diff --git a/drivers/block/rnull/configfs.rs b/drivers/block/rnull/configfs.rs index 6713a6d92391..158f38bbbb8b 100644 --- a/drivers/block/rnull/configfs.rs +++ b/drivers/block/rnull/configfs.rs @@ -13,7 +13,6 @@ use kernel::{ str::{kstrtobool_bytes, CString}, sync::Mutex, }; -use pin_init::PinInit; pub(crate) fn subsystem() -> impl PinInit<kernel::configfs::Subsystem<Config>, Error> { let item_type = configfs_attrs! { @@ -25,7 +24,7 @@ pub(crate) fn subsystem() -> impl PinInit<kernel::configfs::Subsystem<Config>, E ], }; - kernel::configfs::Subsystem::new(c_str!("rnull"), item_type, try_pin_init!(Config {})) + kernel::configfs::Subsystem::new(c"rnull", item_type, try_pin_init!(Config {})) } #[pin_data] diff --git a/drivers/block/rnull/rnull.rs b/drivers/block/rnull/rnull.rs index a9d5e575a2c4..0ca8715febe8 100644 --- a/drivers/block/rnull/rnull.rs +++ b/drivers/block/rnull/rnull.rs @@ -14,12 +14,9 @@ use kernel::{ Operations, TagSet, }, }, - error::Result, - pr_info, prelude::*, sync::{aref::ARef, Arc}, }; -use pin_init::PinInit; module! { type: NullBlkModule, diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index cd1e84653002..c13cda58a7c6 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -44,6 +44,9 @@ #include <linux/task_work.h> #include <linux/namei.h> #include <linux/kref.h> +#include <linux/kfifo.h> +#include <linux/blk-integrity.h> +#include <uapi/linux/fs.h> #include <uapi/linux/ublk_cmd.h> #define UBLK_MINORS (1U << MINORBITS) @@ -54,6 +57,7 @@ #define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC) #define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE) #define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV) +#define UBLK_CMD_TRY_STOP_DEV _IOC_NR(UBLK_U_CMD_TRY_STOP_DEV) #define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF) #define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF) @@ -73,7 +77,11 @@ | UBLK_F_AUTO_BUF_REG \ | UBLK_F_QUIESCE \ | UBLK_F_PER_IO_DAEMON \ - | UBLK_F_BUF_REG_OFF_DAEMON) + | UBLK_F_BUF_REG_OFF_DAEMON \ + | (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \ + | UBLK_F_SAFE_STOP_DEV \ + | UBLK_F_BATCH_IO \ + | UBLK_F_NO_AUTO_PART_SCAN) #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \ | UBLK_F_USER_RECOVERY_REISSUE \ @@ -83,7 +91,20 @@ #define UBLK_PARAM_TYPE_ALL \ (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \ UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \ - UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT) + UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT | \ + UBLK_PARAM_TYPE_INTEGRITY) + +#define UBLK_BATCH_F_ALL \ + (UBLK_BATCH_F_HAS_ZONE_LBA | \ + UBLK_BATCH_F_HAS_BUF_ADDR | \ + UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) + +/* ublk batch fetch uring_cmd */ +struct ublk_batch_fetch_cmd { + struct list_head node; + struct io_uring_cmd *cmd; + unsigned short buf_group; +}; struct ublk_uring_cmd_pdu { /* @@ -105,7 +126,18 @@ struct ublk_uring_cmd_pdu { */ struct ublk_queue *ubq; - u16 tag; + union { + u16 tag; + struct ublk_batch_fetch_cmd *fcmd; /* batch io only */ + }; +}; + +struct ublk_batch_io_data { + struct ublk_device *ub; + struct io_uring_cmd *cmd; + struct ublk_batch_io header; + unsigned int issue_flags; + struct io_comp_batch *iob; }; /* @@ -155,6 +187,9 @@ struct ublk_uring_cmd_pdu { */ #define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2) +/* used for UBLK_F_BATCH_IO only */ +#define UBLK_BATCH_IO_UNUSED_TAG ((unsigned short)-1) + union ublk_io_buf { __u64 addr; struct ublk_auto_buf_reg auto_reg; @@ -179,7 +214,7 @@ struct ublk_io { * if user copy or zero copy are enabled: * - UBLK_REFCOUNT_INIT from dispatch to the server * until UBLK_IO_COMMIT_AND_FETCH_REQ - * - 1 for each inflight ublk_ch_{read,write}_iter() call + * - 1 for each inflight ublk_ch_{read,write}_iter() call not on task * - 1 for each io_uring registered buffer not registered on task * The I/O can only be completed once all references are dropped. * User copy and buffer registration operations are only permitted @@ -190,6 +225,7 @@ struct ublk_io { unsigned task_registered_buffers; void *buf_ctx_handle; + spinlock_t lock; } ____cacheline_aligned_in_smp; struct ublk_queue { @@ -204,6 +240,52 @@ struct ublk_queue { bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */ spinlock_t cancel_lock; struct ublk_device *dev; + u32 nr_io_ready; + + /* + * For supporting UBLK_F_BATCH_IO only. + * + * Inflight ublk request tag is saved in this fifo + * + * There are multiple writer from ublk_queue_rq() or ublk_queue_rqs(), + * so lock is required for storing request tag to fifo + * + * Make sure just one reader for fetching request from task work + * function to ublk server, so no need to grab the lock in reader + * side. + * + * Batch I/O State Management: + * + * The batch I/O system uses implicit state management based on the + * combination of three key variables below. + * + * - IDLE: list_empty(&fcmd_head) && !active_fcmd + * No fetch commands available, events queue in evts_fifo + * + * - READY: !list_empty(&fcmd_head) && !active_fcmd + * Fetch commands available but none processing events + * + * - ACTIVE: active_fcmd + * One fetch command actively processing events from evts_fifo + * + * Key Invariants: + * - At most one active_fcmd at any time (single reader) + * - active_fcmd is always from fcmd_head list when non-NULL + * - evts_fifo can be read locklessly by the single active reader + * - All state transitions require evts_lock protection + * - Multiple writers to evts_fifo require lock protection + */ + struct { + DECLARE_KFIFO_PTR(evts_fifo, unsigned short); + spinlock_t evts_lock; + + /* List of fetch commands available to process events */ + struct list_head fcmd_head; + + /* Currently active fetch command (NULL = none active) */ + struct ublk_batch_fetch_cmd *active_fcmd; + }____cacheline_aligned_in_smp; + struct ublk_io ios[] __counted_by(q_depth); }; @@ -231,7 +313,7 @@ struct ublk_device { struct ublk_params params; struct completion completion; - u32 nr_io_ready; + u32 nr_queue_ready; bool unprivileged_daemons; struct mutex cancel_mutex; bool canceling; @@ -239,6 +321,8 @@ struct ublk_device { struct delayed_work exit_work; struct work_struct partition_scan_work; + bool block_open; /* protected by open_mutex */ + struct ublk_queue *queues[]; }; @@ -252,8 +336,51 @@ static void ublk_io_release(void *priv); static void ublk_stop_dev_unlocked(struct ublk_device *ub); static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq); static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, - u16 q_id, u16 tag, struct ublk_io *io, size_t offset); + u16 q_id, u16 tag, struct ublk_io *io); static inline unsigned int ublk_req_build_flags(struct request *req); +static void ublk_batch_dispatch(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + struct ublk_batch_fetch_cmd *fcmd); + +static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_BATCH_IO; +} + +static inline bool ublk_support_batch_io(const struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_BATCH_IO; +} + +static inline void ublk_io_lock(struct ublk_io *io) +{ + spin_lock(&io->lock); +} + +static inline void ublk_io_unlock(struct ublk_io *io) +{ + spin_unlock(&io->lock); +} + +/* Initialize the event queue */ +static inline int ublk_io_evts_init(struct ublk_queue *q, unsigned int size, + int numa_node) +{ + spin_lock_init(&q->evts_lock); + return kfifo_alloc_node(&q->evts_fifo, size, GFP_KERNEL, numa_node); +} + +/* Check if event queue is empty */ +static inline bool ublk_io_evts_empty(const struct ublk_queue *q) +{ + return kfifo_is_empty(&q->evts_fifo); +} + +static inline void ublk_io_evts_deinit(struct ublk_queue *q) +{ + WARN_ON_ONCE(!kfifo_is_empty(&q->evts_fifo)); + kfifo_free(&q->evts_fifo); +} static inline struct ublksrv_io_desc * ublk_get_iod(const struct ublk_queue *ubq, unsigned tag) @@ -261,6 +388,36 @@ ublk_get_iod(const struct ublk_queue *ubq, unsigned tag) return &ubq->io_cmd_buf[tag]; } +static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY; +} + +static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY; +} + +static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_AUTO_BUF_REG; +} + +static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG; +} + +static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_USER_COPY; +} + +static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_USER_COPY; +} + static inline bool ublk_dev_is_zoned(const struct ublk_device *ub) { return ub->dev_info.flags & UBLK_F_ZONED; @@ -271,6 +428,11 @@ static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq) return ubq->flags & UBLK_F_ZONED; } +static inline bool ublk_dev_support_integrity(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_INTEGRITY; +} + #ifdef CONFIG_BLK_DEV_ZONED struct ublk_zoned_report_desc { @@ -532,7 +694,7 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, #endif static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io, - bool need_map); + bool need_map, struct io_comp_batch *iob); static dev_t ublk_chr_devt; static const struct class ublk_chr_class = { @@ -545,6 +707,64 @@ static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */ static DEFINE_MUTEX(ublk_ctl_mutex); +static struct ublk_batch_fetch_cmd * +ublk_batch_alloc_fcmd(struct io_uring_cmd *cmd) +{ + struct ublk_batch_fetch_cmd *fcmd = kzalloc(sizeof(*fcmd), GFP_NOIO); + + if (fcmd) { + fcmd->cmd = cmd; + fcmd->buf_group = READ_ONCE(cmd->sqe->buf_index); + } + return fcmd; +} + +static void ublk_batch_free_fcmd(struct ublk_batch_fetch_cmd *fcmd) +{ + kfree(fcmd); +} + +static void __ublk_release_fcmd(struct ublk_queue *ubq) +{ + WRITE_ONCE(ubq->active_fcmd, NULL); +} + +/* + * Nothing can move on, so clear ->active_fcmd, and the caller should stop + * dispatching + */ +static void ublk_batch_deinit_fetch_buf(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + struct ublk_batch_fetch_cmd *fcmd, + int res) +{ + spin_lock(&ubq->evts_lock); + list_del_init(&fcmd->node); + WARN_ON_ONCE(fcmd != ubq->active_fcmd); + __ublk_release_fcmd(ubq); + spin_unlock(&ubq->evts_lock); + + io_uring_cmd_done(fcmd->cmd, res, data->issue_flags); + ublk_batch_free_fcmd(fcmd); +} + +static int ublk_batch_fetch_post_cqe(struct ublk_batch_fetch_cmd *fcmd, + struct io_br_sel *sel, + unsigned int issue_flags) +{ + if (io_uring_mshot_cmd_post_cqe(fcmd->cmd, sel, issue_flags)) + return -ENOBUFS; + return 0; +} + +static ssize_t ublk_batch_copy_io_tags(struct ublk_batch_fetch_cmd *fcmd, + void __user *buf, const u16 *tag_buf, + unsigned int len) +{ + if (copy_to_user(buf, tag_buf, len)) + return -EFAULT; + return len; +} #define UBLK_MAX_UBLKS UBLK_MINORS @@ -586,6 +806,53 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub) set_capacity(ub->ub_disk, p->dev_sectors); } +static int ublk_integrity_flags(u32 flags) +{ + int ret_flags = 0; + + if (flags & LBMD_PI_CAP_INTEGRITY) { + flags &= ~LBMD_PI_CAP_INTEGRITY; + ret_flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + } + if (flags & LBMD_PI_CAP_REFTAG) { + flags &= ~LBMD_PI_CAP_REFTAG; + ret_flags |= BLK_INTEGRITY_REF_TAG; + } + return flags ? -EINVAL : ret_flags; +} + +static int ublk_integrity_pi_tuple_size(u8 csum_type) +{ + switch (csum_type) { + case LBMD_PI_CSUM_NONE: + return 0; + case LBMD_PI_CSUM_IP: + case LBMD_PI_CSUM_CRC16_T10DIF: + return 8; + case LBMD_PI_CSUM_CRC64_NVME: + return 16; + default: + return -EINVAL; + } +} + +static enum blk_integrity_checksum ublk_integrity_csum_type(u8 csum_type) +{ + switch (csum_type) { + case LBMD_PI_CSUM_NONE: + return BLK_INTEGRITY_CSUM_NONE; + case LBMD_PI_CSUM_IP: + return BLK_INTEGRITY_CSUM_IP; + case LBMD_PI_CSUM_CRC16_T10DIF: + return BLK_INTEGRITY_CSUM_CRC; + case LBMD_PI_CSUM_CRC64_NVME: + return BLK_INTEGRITY_CSUM_CRC64; + default: + WARN_ON_ONCE(1); + return BLK_INTEGRITY_CSUM_NONE; + } +} + static int ublk_validate_params(const struct ublk_device *ub) { /* basic param is the only one which must be set */ @@ -648,6 +915,29 @@ static int ublk_validate_params(const struct ublk_device *ub) return -EINVAL; } + if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) { + const struct ublk_param_integrity *p = &ub->params.integrity; + int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type); + int flags = ublk_integrity_flags(p->flags); + + if (!ublk_dev_support_integrity(ub)) + return -EINVAL; + if (flags < 0) + return flags; + if (pi_tuple_size < 0) + return pi_tuple_size; + if (!p->metadata_size) + return -EINVAL; + if (p->csum_type == LBMD_PI_CSUM_NONE && + p->flags & LBMD_PI_CAP_REFTAG) + return -EINVAL; + if (p->pi_offset + pi_tuple_size > p->metadata_size) + return -EINVAL; + if (p->interval_exp < SECTOR_SHIFT || + p->interval_exp > ub->params.basic.logical_bs_shift) + return -EINVAL; + } + return 0; } @@ -659,36 +949,6 @@ static void ublk_apply_params(struct ublk_device *ub) ublk_dev_param_zoned_apply(ub); } -static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq) -{ - return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY; -} - -static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub) -{ - return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY; -} - -static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq) -{ - return ubq->flags & UBLK_F_AUTO_BUF_REG; -} - -static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub) -{ - return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG; -} - -static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) -{ - return ubq->flags & UBLK_F_USER_COPY; -} - -static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub) -{ - return ub->dev_info.flags & UBLK_F_USER_COPY; -} - static inline bool ublk_need_map_io(const struct ublk_queue *ubq) { return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) && @@ -726,6 +986,95 @@ static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub) ublk_dev_support_auto_buf_reg(ub); } +/* + * ublk IO Reference Counting Design + * ================================== + * + * For user-copy and zero-copy modes, ublk uses a split reference model with + * two counters that together track IO lifetime: + * + * - io->ref: refcount for off-task buffer registrations and user-copy ops + * - io->task_registered_buffers: count of buffers registered on the IO task + * + * Key Invariant: + * -------------- + * When IO is dispatched to the ublk server (UBLK_IO_FLAG_OWNED_BY_SRV set), + * the sum (io->ref + io->task_registered_buffers) must equal UBLK_REFCOUNT_INIT + * when no active references exist. After IO completion, both counters become + * zero. For I/Os not currently dispatched to the ublk server, |
