summaryrefslogtreecommitdiff
path: root/drivers/block
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-02-09 17:57:21 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2026-02-09 17:57:21 -0800
commit0c00ed308d0559fc216be0442a3df124e9e13533 (patch)
treea41c8509b8543ce8681d0aa9c06a9f94c2b6e458 /drivers/block
parent591beb0e3a03258ef9c01893a5209845799a7c33 (diff)
parent72f4d6fca699a1e35b39c5e5dacac2926d254135 (diff)
downloadlinux-0c00ed308d0559fc216be0442a3df124e9e13533.tar.gz
linux-0c00ed308d0559fc216be0442a3df124e9e13533.tar.bz2
linux-0c00ed308d0559fc216be0442a3df124e9e13533.zip
Merge tag 'for-7.0/block-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull block updates from Jens Axboe: - Support for batch request processing for ublk, improving the efficiency of the kernel/ublk server communication. This can yield nice 7-12% performance improvements - Support for integrity data for ublk - Various other ublk improvements and additions, including a ton of selftests additions and updates - Move the handling of blk-crypto software fallback from below the block layer to above it. This reduces the complexity of dealing with bio splitting - Series fixing a number of potential deadlocks in blk-mq related to the queue usage counter and writeback throttling and rq-qos debugfs handling - Add an async_depth queue attribute, to resolve a performance regression that's been around for a while related to the scheduler depth handling - Only use task_work for IOPOLL completions on NVMe, if it is necessary to do so. An earlier fix for an issue resulted in all these completions being punted to task_work, to guarantee that completions were only run for a given io_uring ring when it was local to that ring. With the new changes, we can detect if it's necessary to use task_work or not, and avoid it if possible. 
- rnbd fixes: - Fix refcount underflow in device unmap path - Handle PREFLUSH and NOUNMAP flags properly in protocol - Fix server-side bi_size for special IOs - Zero response buffer before use - Fix trace format for flags - Add .release to rnbd_dev_ktype - MD pull requests via Yu Kuai - Fix raid5_run() to return error when log_init() fails - Fix IO hang with degraded array with llbitmap - Fix percpu_ref not resurrected on suspend timeout in llbitmap - Fix GPF in write_page caused by resize race - Fix NULL pointer dereference in process_metadata_update - Fix hang when stopping arrays with metadata through dm-raid - Fix any_working flag handling in raid10_sync_request - Refactor sync/recovery code path, improve error handling for badblocks, and remove unused recovery_disabled field - Consolidate mddev boolean fields into mddev_flags - Use mempool to allocate stripe_request_ctx and make sure max_sectors is not less than io_opt in raid5 - Fix return value of mddev_trylock - Fix memory leak in raid1_run() - Add Li Nan as mdraid reviewer - Move phys_vec definitions to the kernel types, mostly in preparation for some VFIO and RDMA changes - Improve the speed for secure erase for some devices - Various little rust updates - Various other minor fixes, improvements, and cleanups * tag 'for-7.0/block-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (162 commits) blk-mq: ABI/sysfs-block: fix docs build warnings selftests: ublk: organize test directories by test ID block: decouple secure erase size limit from discard size limit block: remove redundant kill_bdev() call in set_blocksize() blk-mq: add documentation for new queue attribute async_dpeth block, bfq: convert to use request_queue->async_depth mq-deadline: covert to use request_queue->async_depth kyber: covert to use request_queue->async_depth blk-mq: add a new queue sysfs attribute async_depth blk-mq: factor out a helper blk_mq_limit_depth() blk-mq-sched: unify elevators checking for async 
requests block: convert nr_requests to unsigned int block: don't use strcpy to copy blockdev name blk-mq-debugfs: warn about possible deadlock blk-mq-debugfs: add missing debugfs_mutex in blk_mq_debugfs_register_hctxs() blk-mq-debugfs: remove blk_mq_debugfs_unregister_rqos() blk-mq-debugfs: make blk_mq_debugfs_register_rqos() static blk-rq-qos: fix possible debugfs_mutex deadlock blk-mq-debugfs: factor out a helper to register debugfs for all rq_qos blk-wbt: fix possible deadlock to nest pcpu_alloc_mutex under q_usage_counter ...
Diffstat (limited to 'drivers/block')
-rw-r--r--drivers/block/brd.c3
-rw-r--r--drivers/block/loop.c2
-rw-r--r--drivers/block/null_blk/main.c4
-rw-r--r--drivers/block/rnbd/rnbd-clt-sysfs.c8
-rw-r--r--drivers/block/rnbd/rnbd-clt.c19
-rw-r--r--drivers/block/rnbd/rnbd-proto.h18
-rw-r--r--drivers/block/rnbd/rnbd-srv-trace.h22
-rw-r--r--drivers/block/rnbd/rnbd-srv.c36
-rw-r--r--drivers/block/rnull/configfs.rs3
-rw-r--r--drivers/block/rnull/rnull.rs3
-rw-r--r--drivers/block/ublk_drv.c1905
11 files changed, 1767 insertions, 256 deletions
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 9778259b30d4..a5104cf96609 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -247,8 +247,7 @@ MODULE_ALIAS("rd");
/* Legacy boot options - nonmodular */
static int __init ramdisk_size(char *str)
{
- rd_size = simple_strtol(str, NULL, 0);
- return 1;
+ return kstrtoul(str, 0, &rd_size) == 0;
}
__setup("ramdisk_size=", ramdisk_size);
#endif
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 32a3a5b13802..98789a5297f2 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -969,7 +969,7 @@ static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim,
lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL);
if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY))
lim->features |= BLK_FEAT_WRITE_CACHE;
- if (backing_bdev && !bdev_nonrot(backing_bdev))
+ if (backing_bdev && bdev_rot(backing_bdev))
lim->features |= BLK_FEAT_ROTATIONAL;
lim->max_hw_discard_sectors = max_discard_sectors;
lim->max_write_zeroes_sectors = max_discard_sectors;
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 4c0632ab4e1b..740a8ac42075 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -642,7 +642,7 @@ static void nullb_device_release(struct config_item *item)
null_free_dev(dev);
}
-static struct configfs_item_operations nullb_device_ops = {
+static const struct configfs_item_operations nullb_device_ops = {
.release = nullb_device_release,
};
@@ -749,7 +749,7 @@ static struct configfs_attribute *nullb_group_attrs[] = {
NULL,
};
-static struct configfs_group_operations nullb_group_ops = {
+static const struct configfs_group_operations nullb_group_ops = {
.make_group = nullb_group_make_group,
.drop_item = nullb_group_drop_item,
};
diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c
index 6ea7c12e3a87..144aea1466a4 100644
--- a/drivers/block/rnbd/rnbd-clt-sysfs.c
+++ b/drivers/block/rnbd/rnbd-clt-sysfs.c
@@ -475,9 +475,17 @@ void rnbd_clt_remove_dev_symlink(struct rnbd_clt_dev *dev)
}
}
+static void rnbd_dev_release(struct kobject *kobj)
+{
+ struct rnbd_clt_dev *dev = container_of(kobj, struct rnbd_clt_dev, kobj);
+
+ kfree(dev);
+}
+
static const struct kobj_type rnbd_dev_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = rnbd_dev_groups,
+ .release = rnbd_dev_release,
};
static int rnbd_clt_add_dev_kobj(struct rnbd_clt_dev *dev)
diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c
index d1c354636315..757df2896aeb 100644
--- a/drivers/block/rnbd/rnbd-clt.c
+++ b/drivers/block/rnbd/rnbd-clt.c
@@ -60,7 +60,9 @@ static void rnbd_clt_put_dev(struct rnbd_clt_dev *dev)
kfree(dev->pathname);
rnbd_clt_put_sess(dev->sess);
mutex_destroy(&dev->lock);
- kfree(dev);
+
+ if (dev->kobj.state_initialized)
+ kobject_put(&dev->kobj);
}
static inline bool rnbd_clt_get_dev(struct rnbd_clt_dev *dev)
@@ -1517,7 +1519,7 @@ static bool insert_dev_if_not_exists_devpath(struct rnbd_clt_dev *dev)
return found;
}
-static void delete_dev(struct rnbd_clt_dev *dev)
+static void rnbd_delete_dev(struct rnbd_clt_dev *dev)
{
struct rnbd_clt_session *sess = dev->sess;
@@ -1638,7 +1640,7 @@ put_iu:
kfree(rsp);
rnbd_put_iu(sess, iu);
del_dev:
- delete_dev(dev);
+ rnbd_delete_dev(dev);
put_dev:
rnbd_clt_put_dev(dev);
put_sess:
@@ -1647,13 +1649,13 @@ put_sess:
return ERR_PTR(ret);
}
-static void destroy_gen_disk(struct rnbd_clt_dev *dev)
+static void rnbd_destroy_gen_disk(struct rnbd_clt_dev *dev)
{
del_gendisk(dev->gd);
put_disk(dev->gd);
}
-static void destroy_sysfs(struct rnbd_clt_dev *dev,
+static void rnbd_destroy_sysfs(struct rnbd_clt_dev *dev,
const struct attribute *sysfs_self)
{
rnbd_clt_remove_dev_symlink(dev);
@@ -1662,7 +1664,6 @@ static void destroy_sysfs(struct rnbd_clt_dev *dev,
/* To avoid deadlock firstly remove itself */
sysfs_remove_file_self(&dev->kobj, sysfs_self);
kobject_del(&dev->kobj);
- kobject_put(&dev->kobj);
}
}
@@ -1691,9 +1692,9 @@ int rnbd_clt_unmap_device(struct rnbd_clt_dev *dev, bool force,
dev->dev_state = DEV_STATE_UNMAPPED;
mutex_unlock(&dev->lock);
- delete_dev(dev);
- destroy_sysfs(dev, sysfs_self);
- destroy_gen_disk(dev);
+ rnbd_delete_dev(dev);
+ rnbd_destroy_sysfs(dev, sysfs_self);
+ rnbd_destroy_gen_disk(dev);
if (was_mapped && sess->rtrs)
send_msg_close(dev, dev->device_id, RTRS_PERMIT_WAIT);
diff --git a/drivers/block/rnbd/rnbd-proto.h b/drivers/block/rnbd/rnbd-proto.h
index 77360c2a6069..64f1cfe9f8ef 100644
--- a/drivers/block/rnbd/rnbd-proto.h
+++ b/drivers/block/rnbd/rnbd-proto.h
@@ -18,7 +18,7 @@
#include <rdma/ib.h>
#define RNBD_PROTO_VER_MAJOR 2
-#define RNBD_PROTO_VER_MINOR 0
+#define RNBD_PROTO_VER_MINOR 2
/* The default port number the RTRS server is listening on. */
#define RTRS_PORT 1234
@@ -197,6 +197,8 @@ struct rnbd_msg_io {
*
* @RNBD_F_SYNC: request is sync (sync write or read)
* @RNBD_F_FUA: forced unit access
+ * @RNBD_F_PREFLUSH: request for cache flush
+ * @RNBD_F_NOUNMAP: do not free blocks when zeroing
*/
enum rnbd_io_flags {
@@ -211,6 +213,8 @@ enum rnbd_io_flags {
/* Flags */
RNBD_F_SYNC = 1<<(RNBD_OP_BITS + 0),
RNBD_F_FUA = 1<<(RNBD_OP_BITS + 1),
+ RNBD_F_PREFLUSH = 1<<(RNBD_OP_BITS + 2),
+ RNBD_F_NOUNMAP = 1<<(RNBD_OP_BITS + 3)
};
static inline u32 rnbd_op(u32 flags)
@@ -245,6 +249,9 @@ static inline blk_opf_t rnbd_to_bio_flags(u32 rnbd_opf)
break;
case RNBD_OP_WRITE_ZEROES:
bio_opf = REQ_OP_WRITE_ZEROES;
+
+ if (rnbd_opf & RNBD_F_NOUNMAP)
+ bio_opf |= REQ_NOUNMAP;
break;
default:
WARN(1, "Unknown RNBD type: %d (flags %d)\n",
@@ -258,6 +265,9 @@ static inline blk_opf_t rnbd_to_bio_flags(u32 rnbd_opf)
if (rnbd_opf & RNBD_F_FUA)
bio_opf |= REQ_FUA;
+ if (rnbd_opf & RNBD_F_PREFLUSH)
+ bio_opf |= REQ_PREFLUSH;
+
return bio_opf;
}
@@ -280,6 +290,9 @@ static inline u32 rq_to_rnbd_flags(struct request *rq)
break;
case REQ_OP_WRITE_ZEROES:
rnbd_opf = RNBD_OP_WRITE_ZEROES;
+
+ if (rq->cmd_flags & REQ_NOUNMAP)
+ rnbd_opf |= RNBD_F_NOUNMAP;
break;
case REQ_OP_FLUSH:
rnbd_opf = RNBD_OP_FLUSH;
@@ -297,6 +310,9 @@ static inline u32 rq_to_rnbd_flags(struct request *rq)
if (op_is_flush(rq->cmd_flags))
rnbd_opf |= RNBD_F_FUA;
+ if (rq->cmd_flags & REQ_PREFLUSH)
+ rnbd_opf |= RNBD_F_PREFLUSH;
+
return rnbd_opf;
}
diff --git a/drivers/block/rnbd/rnbd-srv-trace.h b/drivers/block/rnbd/rnbd-srv-trace.h
index 89d0bcb17195..18ae2ed5537a 100644
--- a/drivers/block/rnbd/rnbd-srv-trace.h
+++ b/drivers/block/rnbd/rnbd-srv-trace.h
@@ -44,24 +44,6 @@ DEFINE_EVENT(rnbd_srv_link_class, name, \
DEFINE_LINK_EVENT(create_sess);
DEFINE_LINK_EVENT(destroy_sess);
-TRACE_DEFINE_ENUM(RNBD_OP_READ);
-TRACE_DEFINE_ENUM(RNBD_OP_WRITE);
-TRACE_DEFINE_ENUM(RNBD_OP_FLUSH);
-TRACE_DEFINE_ENUM(RNBD_OP_DISCARD);
-TRACE_DEFINE_ENUM(RNBD_OP_SECURE_ERASE);
-TRACE_DEFINE_ENUM(RNBD_F_SYNC);
-TRACE_DEFINE_ENUM(RNBD_F_FUA);
-
-#define show_rnbd_rw_flags(x) \
- __print_flags(x, "|", \
- { RNBD_OP_READ, "READ" }, \
- { RNBD_OP_WRITE, "WRITE" }, \
- { RNBD_OP_FLUSH, "FLUSH" }, \
- { RNBD_OP_DISCARD, "DISCARD" }, \
- { RNBD_OP_SECURE_ERASE, "SECURE_ERASE" }, \
- { RNBD_F_SYNC, "SYNC" }, \
- { RNBD_F_FUA, "FUA" })
-
TRACE_EVENT(process_rdma,
TP_PROTO(struct rnbd_srv_session *srv,
const struct rnbd_msg_io *msg,
@@ -97,7 +79,7 @@ TRACE_EVENT(process_rdma,
__entry->usrlen = usrlen;
),
- TP_printk("I/O req: sess: %s, type: %s, ver: %d, devid: %u, sector: %llu, bsize: %u, flags: %s, ioprio: %d, datalen: %u, usrlen: %zu",
+ TP_printk("I/O req: sess: %s, type: %s, ver: %d, devid: %u, sector: %llu, bsize: %u, flags: %u, ioprio: %d, datalen: %u, usrlen: %zu",
__get_str(sessname),
__print_symbolic(__entry->dir,
{ READ, "READ" },
@@ -106,7 +88,7 @@ TRACE_EVENT(process_rdma,
__entry->device_id,
__entry->sector,
__entry->bi_size,
- show_rnbd_rw_flags(__entry->flags),
+ __entry->flags,
__entry->ioprio,
__entry->datalen,
__entry->usrlen
diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c
index 2df8941a6b14..7eeb321d6140 100644
--- a/drivers/block/rnbd/rnbd-srv.c
+++ b/drivers/block/rnbd/rnbd-srv.c
@@ -145,18 +145,30 @@ static int process_rdma(struct rnbd_srv_session *srv_sess,
priv->sess_dev = sess_dev;
priv->id = id;
- bio = bio_alloc(file_bdev(sess_dev->bdev_file), 1,
+ bio = bio_alloc(file_bdev(sess_dev->bdev_file), !!datalen,
rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL);
- bio_add_virt_nofail(bio, data, datalen);
-
- bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw));
- if (bio_has_data(bio) &&
- bio->bi_iter.bi_size != le32_to_cpu(msg->bi_size)) {
- rnbd_srv_err_rl(sess_dev, "Datalen mismatch: bio bi_size (%u), bi_size (%u)\n",
- bio->bi_iter.bi_size, msg->bi_size);
- err = -EINVAL;
- goto bio_put;
+ if (unlikely(!bio)) {
+ err = -ENOMEM;
+ goto put_sess_dev;
}
+
+ if (!datalen) {
+ /*
+ * For special requests like DISCARD and WRITE_ZEROES, the datalen is zero.
+ */
+ bio->bi_iter.bi_size = le32_to_cpu(msg->bi_size);
+ } else {
+ bio_add_virt_nofail(bio, data, datalen);
+ bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw));
+ if (bio->bi_iter.bi_size != le32_to_cpu(msg->bi_size)) {
+ rnbd_srv_err_rl(sess_dev,
+ "Datalen mismatch: bio bi_size (%u), bi_size (%u)\n",
+ bio->bi_iter.bi_size, msg->bi_size);
+ err = -EINVAL;
+ goto bio_put;
+ }
+ }
+
bio->bi_end_io = rnbd_dev_bi_end_io;
bio->bi_private = priv;
bio->bi_iter.bi_sector = le64_to_cpu(msg->sector);
@@ -170,6 +182,7 @@ static int process_rdma(struct rnbd_srv_session *srv_sess,
bio_put:
bio_put(bio);
+put_sess_dev:
rnbd_put_sess_dev(sess_dev);
err:
kfree(priv);
@@ -538,6 +551,8 @@ static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp,
{
struct block_device *bdev = file_bdev(sess_dev->bdev_file);
+ memset(rsp, 0, sizeof(*rsp));
+
rsp->hdr.type = cpu_to_le16(RNBD_MSG_OPEN_RSP);
rsp->device_id = cpu_to_le32(sess_dev->device_id);
rsp->nsectors = cpu_to_le64(bdev_nr_sectors(bdev));
@@ -644,6 +659,7 @@ static void process_msg_sess_info(struct rnbd_srv_session *srv_sess,
trace_process_msg_sess_info(srv_sess, sess_info_msg);
+ memset(rsp, 0, sizeof(*rsp));
rsp->hdr.type = cpu_to_le16(RNBD_MSG_SESS_INFO_RSP);
rsp->ver = srv_sess->ver;
}
diff --git a/drivers/block/rnull/configfs.rs b/drivers/block/rnull/configfs.rs
index 6713a6d92391..158f38bbbb8b 100644
--- a/drivers/block/rnull/configfs.rs
+++ b/drivers/block/rnull/configfs.rs
@@ -13,7 +13,6 @@ use kernel::{
str::{kstrtobool_bytes, CString},
sync::Mutex,
};
-use pin_init::PinInit;
pub(crate) fn subsystem() -> impl PinInit<kernel::configfs::Subsystem<Config>, Error> {
let item_type = configfs_attrs! {
@@ -25,7 +24,7 @@ pub(crate) fn subsystem() -> impl PinInit<kernel::configfs::Subsystem<Config>, E
],
};
- kernel::configfs::Subsystem::new(c_str!("rnull"), item_type, try_pin_init!(Config {}))
+ kernel::configfs::Subsystem::new(c"rnull", item_type, try_pin_init!(Config {}))
}
#[pin_data]
diff --git a/drivers/block/rnull/rnull.rs b/drivers/block/rnull/rnull.rs
index a9d5e575a2c4..0ca8715febe8 100644
--- a/drivers/block/rnull/rnull.rs
+++ b/drivers/block/rnull/rnull.rs
@@ -14,12 +14,9 @@ use kernel::{
Operations, TagSet,
},
},
- error::Result,
- pr_info,
prelude::*,
sync::{aref::ARef, Arc},
};
-use pin_init::PinInit;
module! {
type: NullBlkModule,
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index cd1e84653002..c13cda58a7c6 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -44,6 +44,9 @@
#include <linux/task_work.h>
#include <linux/namei.h>
#include <linux/kref.h>
+#include <linux/kfifo.h>
+#include <linux/blk-integrity.h>
+#include <uapi/linux/fs.h>
#include <uapi/linux/ublk_cmd.h>
#define UBLK_MINORS (1U << MINORBITS)
@@ -54,6 +57,7 @@
#define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
#define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
#define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
+#define UBLK_CMD_TRY_STOP_DEV _IOC_NR(UBLK_U_CMD_TRY_STOP_DEV)
#define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
#define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
@@ -73,7 +77,11 @@
| UBLK_F_AUTO_BUF_REG \
| UBLK_F_QUIESCE \
| UBLK_F_PER_IO_DAEMON \
- | UBLK_F_BUF_REG_OFF_DAEMON)
+ | UBLK_F_BUF_REG_OFF_DAEMON \
+ | (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \
+ | UBLK_F_SAFE_STOP_DEV \
+ | UBLK_F_BATCH_IO \
+ | UBLK_F_NO_AUTO_PART_SCAN)
#define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
| UBLK_F_USER_RECOVERY_REISSUE \
@@ -83,7 +91,20 @@
#define UBLK_PARAM_TYPE_ALL \
(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \
- UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)
+ UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT | \
+ UBLK_PARAM_TYPE_INTEGRITY)
+
+#define UBLK_BATCH_F_ALL \
+ (UBLK_BATCH_F_HAS_ZONE_LBA | \
+ UBLK_BATCH_F_HAS_BUF_ADDR | \
+ UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK)
+
+/* ublk batch fetch uring_cmd */
+struct ublk_batch_fetch_cmd {
+ struct list_head node;
+ struct io_uring_cmd *cmd;
+ unsigned short buf_group;
+};
struct ublk_uring_cmd_pdu {
/*
@@ -105,7 +126,18 @@ struct ublk_uring_cmd_pdu {
*/
struct ublk_queue *ubq;
- u16 tag;
+ union {
+ u16 tag;
+ struct ublk_batch_fetch_cmd *fcmd; /* batch io only */
+ };
+};
+
+struct ublk_batch_io_data {
+ struct ublk_device *ub;
+ struct io_uring_cmd *cmd;
+ struct ublk_batch_io header;
+ unsigned int issue_flags;
+ struct io_comp_batch *iob;
};
/*
@@ -155,6 +187,9 @@ struct ublk_uring_cmd_pdu {
*/
#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
+/* used for UBLK_F_BATCH_IO only */
+#define UBLK_BATCH_IO_UNUSED_TAG ((unsigned short)-1)
+
union ublk_io_buf {
__u64 addr;
struct ublk_auto_buf_reg auto_reg;
@@ -179,7 +214,7 @@ struct ublk_io {
* if user copy or zero copy are enabled:
* - UBLK_REFCOUNT_INIT from dispatch to the server
* until UBLK_IO_COMMIT_AND_FETCH_REQ
- * - 1 for each inflight ublk_ch_{read,write}_iter() call
+ * - 1 for each inflight ublk_ch_{read,write}_iter() call not on task
* - 1 for each io_uring registered buffer not registered on task
* The I/O can only be completed once all references are dropped.
* User copy and buffer registration operations are only permitted
@@ -190,6 +225,7 @@ struct ublk_io {
unsigned task_registered_buffers;
void *buf_ctx_handle;
+ spinlock_t lock;
} ____cacheline_aligned_in_smp;
struct ublk_queue {
@@ -204,6 +240,52 @@ struct ublk_queue {
bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
spinlock_t cancel_lock;
struct ublk_device *dev;
+ u32 nr_io_ready;
+
+ /*
+ * For supporting UBLK_F_BATCH_IO only.
+ *
+ * Inflight ublk request tag is saved in this fifo
+ *
+ * There are multiple writer from ublk_queue_rq() or ublk_queue_rqs(),
+ * so lock is required for storing request tag to fifo
+ *
+ * Make sure just one reader for fetching request from task work
+ * function to ublk server, so no need to grab the lock in reader
+ * side.
+ *
+ * Batch I/O State Management:
+ *
+ * The batch I/O system uses implicit state management based on the
+ * combination of three key variables below.
+ *
+ * - IDLE: list_empty(&fcmd_head) && !active_fcmd
+ * No fetch commands available, events queue in evts_fifo
+ *
+ * - READY: !list_empty(&fcmd_head) && !active_fcmd
+ * Fetch commands available but none processing events
+ *
+ * - ACTIVE: active_fcmd
+ * One fetch command actively processing events from evts_fifo
+ *
+ * Key Invariants:
+ * - At most one active_fcmd at any time (single reader)
+ * - active_fcmd is always from fcmd_head list when non-NULL
+ * - evts_fifo can be read locklessly by the single active reader
+ * - All state transitions require evts_lock protection
+ * - Multiple writers to evts_fifo require lock protection
+ */
+ struct {
+ DECLARE_KFIFO_PTR(evts_fifo, unsigned short);
+ spinlock_t evts_lock;
+
+ /* List of fetch commands available to process events */
+ struct list_head fcmd_head;
+
+ /* Currently active fetch command (NULL = none active) */
+ struct ublk_batch_fetch_cmd *active_fcmd;
+ }____cacheline_aligned_in_smp;
+
struct ublk_io ios[] __counted_by(q_depth);
};
@@ -231,7 +313,7 @@ struct ublk_device {
struct ublk_params params;
struct completion completion;
- u32 nr_io_ready;
+ u32 nr_queue_ready;
bool unprivileged_daemons;
struct mutex cancel_mutex;
bool canceling;
@@ -239,6 +321,8 @@ struct ublk_device {
struct delayed_work exit_work;
struct work_struct partition_scan_work;
+ bool block_open; /* protected by open_mutex */
+
struct ublk_queue *queues[];
};
@@ -252,8 +336,51 @@ static void ublk_io_release(void *priv);
static void ublk_stop_dev_unlocked(struct ublk_device *ub);
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
- u16 q_id, u16 tag, struct ublk_io *io, size_t offset);
+ u16 q_id, u16 tag, struct ublk_io *io);
static inline unsigned int ublk_req_build_flags(struct request *req);
+static void ublk_batch_dispatch(struct ublk_queue *ubq,
+ const struct ublk_batch_io_data *data,
+ struct ublk_batch_fetch_cmd *fcmd);
+
+static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub)
+{
+ return ub->dev_info.flags & UBLK_F_BATCH_IO;
+}
+
+static inline bool ublk_support_batch_io(const struct ublk_queue *ubq)
+{
+ return ubq->flags & UBLK_F_BATCH_IO;
+}
+
+static inline void ublk_io_lock(struct ublk_io *io)
+{
+ spin_lock(&io->lock);
+}
+
+static inline void ublk_io_unlock(struct ublk_io *io)
+{
+ spin_unlock(&io->lock);
+}
+
+/* Initialize the event queue */
+static inline int ublk_io_evts_init(struct ublk_queue *q, unsigned int size,
+ int numa_node)
+{
+ spin_lock_init(&q->evts_lock);
+ return kfifo_alloc_node(&q->evts_fifo, size, GFP_KERNEL, numa_node);
+}
+
+/* Check if event queue is empty */
+static inline bool ublk_io_evts_empty(const struct ublk_queue *q)
+{
+ return kfifo_is_empty(&q->evts_fifo);
+}
+
+static inline void ublk_io_evts_deinit(struct ublk_queue *q)
+{
+ WARN_ON_ONCE(!kfifo_is_empty(&q->evts_fifo));
+ kfifo_free(&q->evts_fifo);
+}
static inline struct ublksrv_io_desc *
ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
@@ -261,6 +388,36 @@ ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
return &ubq->io_cmd_buf[tag];
}
+static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
+{
+ return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
+}
+
+static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
+{
+ return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY;
+}
+
+static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
+{
+ return ubq->flags & UBLK_F_AUTO_BUF_REG;
+}
+
+static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub)
+{
+ return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG;
+}
+
+static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
+{
+ return ubq->flags & UBLK_F_USER_COPY;
+}
+
+static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub)
+{
+ return ub->dev_info.flags & UBLK_F_USER_COPY;
+}
+
static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
{
return ub->dev_info.flags & UBLK_F_ZONED;
@@ -271,6 +428,11 @@ static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
return ubq->flags & UBLK_F_ZONED;
}
+static inline bool ublk_dev_support_integrity(const struct ublk_device *ub)
+{
+ return ub->dev_info.flags & UBLK_F_INTEGRITY;
+}
+
#ifdef CONFIG_BLK_DEV_ZONED
struct ublk_zoned_report_desc {
@@ -532,7 +694,7 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
#endif
static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
- bool need_map);
+ bool need_map, struct io_comp_batch *iob);
static dev_t ublk_chr_devt;
static const struct class ublk_chr_class = {
@@ -545,6 +707,64 @@ static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */
static DEFINE_MUTEX(ublk_ctl_mutex);
+static struct ublk_batch_fetch_cmd *
+ublk_batch_alloc_fcmd(struct io_uring_cmd *cmd)
+{
+ struct ublk_batch_fetch_cmd *fcmd = kzalloc(sizeof(*fcmd), GFP_NOIO);
+
+ if (fcmd) {
+ fcmd->cmd = cmd;
+ fcmd->buf_group = READ_ONCE(cmd->sqe->buf_index);
+ }
+ return fcmd;
+}
+
+static void ublk_batch_free_fcmd(struct ublk_batch_fetch_cmd *fcmd)
+{
+ kfree(fcmd);
+}
+
+static void __ublk_release_fcmd(struct ublk_queue *ubq)
+{
+ WRITE_ONCE(ubq->active_fcmd, NULL);
+}
+
+/*
+ * Nothing can move on, so clear ->active_fcmd, and the caller should stop
+ * dispatching
+ */
+static void ublk_batch_deinit_fetch_buf(struct ublk_queue *ubq,
+ const struct ublk_batch_io_data *data,
+ struct ublk_batch_fetch_cmd *fcmd,
+ int res)
+{
+ spin_lock(&ubq->evts_lock);
+ list_del_init(&fcmd->node);
+ WARN_ON_ONCE(fcmd != ubq->active_fcmd);
+ __ublk_release_fcmd(ubq);
+ spin_unlock(&ubq->evts_lock);
+
+ io_uring_cmd_done(fcmd->cmd, res, data->issue_flags);
+ ublk_batch_free_fcmd(fcmd);
+}
+
+static int ublk_batch_fetch_post_cqe(struct ublk_batch_fetch_cmd *fcmd,
+ struct io_br_sel *sel,
+ unsigned int issue_flags)
+{
+ if (io_uring_mshot_cmd_post_cqe(fcmd->cmd, sel, issue_flags))
+ return -ENOBUFS;
+ return 0;
+}
+
+static ssize_t ublk_batch_copy_io_tags(struct ublk_batch_fetch_cmd *fcmd,
+ void __user *buf, const u16 *tag_buf,
+ unsigned int len)
+{
+ if (copy_to_user(buf, tag_buf, len))
+ return -EFAULT;
+ return len;
+}
#define UBLK_MAX_UBLKS UBLK_MINORS
@@ -586,6 +806,53 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub)
set_capacity(ub->ub_disk, p->dev_sectors);
}
+static int ublk_integrity_flags(u32 flags)
+{
+ int ret_flags = 0;
+
+ if (flags & LBMD_PI_CAP_INTEGRITY) {
+ flags &= ~LBMD_PI_CAP_INTEGRITY;
+ ret_flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
+ }
+ if (flags & LBMD_PI_CAP_REFTAG) {
+ flags &= ~LBMD_PI_CAP_REFTAG;
+ ret_flags |= BLK_INTEGRITY_REF_TAG;
+ }
+ return flags ? -EINVAL : ret_flags;
+}
+
+static int ublk_integrity_pi_tuple_size(u8 csum_type)
+{
+ switch (csum_type) {
+ case LBMD_PI_CSUM_NONE:
+ return 0;
+ case LBMD_PI_CSUM_IP:
+ case LBMD_PI_CSUM_CRC16_T10DIF:
+ return 8;
+ case LBMD_PI_CSUM_CRC64_NVME:
+ return 16;
+ default:
+ return -EINVAL;
+ }
+}
+
+static enum blk_integrity_checksum ublk_integrity_csum_type(u8 csum_type)
+{
+ switch (csum_type) {
+ case LBMD_PI_CSUM_NONE:
+ return BLK_INTEGRITY_CSUM_NONE;
+ case LBMD_PI_CSUM_IP:
+ return BLK_INTEGRITY_CSUM_IP;
+ case LBMD_PI_CSUM_CRC16_T10DIF:
+ return BLK_INTEGRITY_CSUM_CRC;
+ case LBMD_PI_CSUM_CRC64_NVME:
+ return BLK_INTEGRITY_CSUM_CRC64;
+ default:
+ WARN_ON_ONCE(1);
+ return BLK_INTEGRITY_CSUM_NONE;
+ }
+}
+
static int ublk_validate_params(const struct ublk_device *ub)
{
/* basic param is the only one which must be set */
@@ -648,6 +915,29 @@ static int ublk_validate_params(const struct ublk_device *ub)
return -EINVAL;
}
+ if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
+ const struct ublk_param_integrity *p = &ub->params.integrity;
+ int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
+ int flags = ublk_integrity_flags(p->flags);
+
+ if (!ublk_dev_support_integrity(ub))
+ return -EINVAL;
+ if (flags < 0)
+ return flags;
+ if (pi_tuple_size < 0)
+ return pi_tuple_size;
+ if (!p->metadata_size)
+ return -EINVAL;
+ if (p->csum_type == LBMD_PI_CSUM_NONE &&
+ p->flags & LBMD_PI_CAP_REFTAG)
+ return -EINVAL;
+ if (p->pi_offset + pi_tuple_size > p->metadata_size)
+ return -EINVAL;
+ if (p->interval_exp < SECTOR_SHIFT ||
+ p->interval_exp > ub->params.basic.logical_bs_shift)
+ return -EINVAL;
+ }
+
return 0;
}
@@ -659,36 +949,6 @@ static void ublk_apply_params(struct ublk_device *ub)
ublk_dev_param_zoned_apply(ub);
}
-static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
-{
- return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
-}
-
-static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
-{
- return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY;
-}
-
-static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
-{
- return ubq->flags & UBLK_F_AUTO_BUF_REG;
-}
-
-static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub)
-{
- return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG;
-}
-
-static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
-{
- return ubq->flags & UBLK_F_USER_COPY;
-}
-
-static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub)
-{
- return ub->dev_info.flags & UBLK_F_USER_COPY;
-}
-
static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
{
return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
@@ -726,6 +986,95 @@ static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub)
ublk_dev_support_auto_buf_reg(ub);
}
+/*
+ * ublk IO Reference Counting Design
+ * ==================================
+ *
+ * For user-copy and zero-copy modes, ublk uses a split reference model with
+ * two counters that together track IO lifetime:
+ *
+ * - io->ref: refcount for off-task buffer registrations and user-copy ops
+ * - io->task_registered_buffers: count of buffers registered on the IO task
+ *
+ * Key Invariant:
+ * --------------
+ * When IO is dispatched to the ublk server (UBLK_IO_FLAG_OWNED_BY_SRV set),
+ * the sum (io->ref + io->task_registered_buffers) must equal UBLK_REFCOUNT_INIT
+ * when no active references exist. After IO completion, both counters become
+ * zero. For I/Os not currently dispatched to the ublk server,