author     Linus Torvalds <torvalds@linux-foundation.org>   2024-05-13 13:03:54 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>   2024-05-13 13:03:54 -0700
commit     0c9f4ac808b017a0013cee92a30de980550145d5 (patch)
tree       94eedbb9ef4815df9dc8d1dd6424fc92a2fbcd7a /block
parent     9961a785944601e32f185ea696347b22ffda634c (diff)
parent     a3166c51702bb00b8f8b84022090cbab8f37be1a (diff)
Merge tag 'for-6.10/block-20240511' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe:

 - Add a partscan attribute in sysfs, fixing an issue with systemd
   relying on an internal interface that went away.

 - Attempt #2 at making long running discards interruptible. The
   previous attempt went into 6.9, but we ended up mostly reverting it
   as it had issues.

 - Remove old ida_simple API in bcache.

 - Support for zoned write plugging, greatly improving the performance
   on zoned devices.

 - Remove the old throttle low interface, which had been experimental
   since 2017, never made it beyond that status, and isn't being used.

 - Remove page->index debugging checks in brd, as they haven't caught
   anything and this prepares us for removing page->index from struct
   page.

 - MD pull request from Song.

 - Don't schedule block workers on isolated CPUs.

* tag 'for-6.10/block-20240511' of git://git.kernel.dk/linux: (84 commits)
  blk-throttle: delay initialization until configuration
  blk-throttle: remove CONFIG_BLK_DEV_THROTTLING_LOW
  block: fix that util can be greater than 100%
  block: support to account io_ticks precisely
  block: add plug while submitting IO
  bcache: fix variable length array abuse in btree_iter
  bcache: Remove usage of the deprecated ida_simple_xx() API
  md: Revert "md: Fix overflow in is_mddev_idle"
  blk-lib: check for kill signal in ioctl BLKDISCARD
  block: add a bio_await_chain helper
  block: add a blk_alloc_discard_bio helper
  block: add a bio_chain_and_submit helper
  block: move discard checks into the ioctl handler
  block: remove the discard_granularity check in __blkdev_issue_discard
  block/ioctl: prefer different overflow check
  null_blk: Fix the WARNING: modpost: missing MODULE_DESCRIPTION()
  block: fix and simplify blkdevparts= cmdline parsing
  block: refine the EOF check in blkdev_iomap_begin
  block: add a partscan sysfs attribute for disks
  block: add a disk_has_partscan helper
  ...
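To make the "long running discards are now interruptible" item above concrete: userspace reaches that path through the BLKDISCARD ioctl, whose handler now checks for a fatal signal between discard chunks. A minimal userspace sketch follows (not part of this pull; the device path and error handling are placeholders):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* BLKDISCARD, BLKGETSIZE64 */

int main(void)
{
	int fd = open("/dev/sdX", O_WRONLY);	/* example device node */
	uint64_t range[2] = { 0, 0 };		/* { offset, length } in bytes */

	if (fd < 0 || ioctl(fd, BLKGETSIZE64, &range[1]) < 0) {
		perror("open/BLKGETSIZE64");
		return 1;
	}
	/*
	 * A full-device discard can now be aborted by a fatal signal
	 * (e.g. SIGKILL) instead of running to completion.
	 */
	if (ioctl(fd, BLKDISCARD, range) < 0)
		perror("BLKDISCARD");
	close(fd);
	return 0;
}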
Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig                    16
-rw-r--r--  block/Makefile                    1
-rw-r--r--  block/bio.c                      50
-rw-r--r--  block/blk-cgroup-rwstat.c        18
-rw-r--r--  block/blk-cgroup.c                9
-rw-r--r--  block/blk-core.c                 26
-rw-r--r--  block/blk-flush.c                 2
-rw-r--r--  block/blk-lib.c                  68
-rw-r--r--  block/blk-merge.c                25
-rw-r--r--  block/blk-mq-debugfs-zoned.c     22
-rw-r--r--  block/blk-mq-debugfs.c            3
-rw-r--r--  block/blk-mq-debugfs.h            6
-rw-r--r--  block/blk-mq.c                  184
-rw-r--r--  block/blk-mq.h                   31
-rw-r--r--  block/blk-settings.c             46
-rw-r--r--  block/blk-stat.c                  3
-rw-r--r--  block/blk-sysfs.c                10
-rw-r--r--  block/blk-throttle.c           1019
-rw-r--r--  block/blk-throttle.h             46
-rw-r--r--  block/blk-zoned.c              1508
-rw-r--r--  block/blk.h                      97
-rw-r--r--  block/elevator.c                 46
-rw-r--r--  block/elevator.h                  1
-rw-r--r--  block/fops.c                     31
-rw-r--r--  block/genhd.c                    32
-rw-r--r--  block/ioctl.c                    42
-rw-r--r--  block/mq-deadline.c             204
-rw-r--r--  block/partitions/cmdline.c       49
-rw-r--r--  block/partitions/core.c           5
29 files changed, 1959 insertions, 1641 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 1de4682d48cc..dc12af58dbae 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -100,7 +100,6 @@ config BLK_DEV_WRITE_MOUNTED
config BLK_DEV_ZONED
bool "Zoned block device support"
- select MQ_IOSCHED_DEADLINE
help
Block layer zoned block device support. This option enables
support for ZAC/ZBC/ZNS host-managed and host-aware zoned block
@@ -120,17 +119,6 @@ config BLK_DEV_THROTTLING
See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information.
-config BLK_DEV_THROTTLING_LOW
- bool "Block throttling .low limit interface support (EXPERIMENTAL)"
- depends on BLK_DEV_THROTTLING
- help
- Add .low limit interface for block throttling. The low limit is a best
- effort limit to prioritize cgroups. Depending on the setting, the limit
- can be used to protect cgroups in terms of bandwidth/iops and better
- utilize disk resource.
-
- Note, this is an experimental interface and could be changed someday.
-
config BLK_WBT
bool "Enable support for block device writeback throttling"
help
@@ -198,10 +186,6 @@ config BLK_DEBUG_FS
Unless you are building a kernel for a tiny system, you should
say Y here.
-config BLK_DEBUG_FS_ZONED
- bool
- default BLK_DEBUG_FS && BLK_DEV_ZONED
-
config BLK_SED_OPAL
bool "Logic for interfacing with Opal enabled SEDs"
depends on KEYS
diff --git a/block/Makefile b/block/Makefile
index 46ada9dc8bbf..168150b9c510 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -33,7 +33,6 @@ obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o
obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o
obj-$(CONFIG_BLK_WBT) += blk-wbt.o
obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
-obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
obj-$(CONFIG_BLK_PM) += blk-pm.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \
diff --git a/block/bio.c b/block/bio.c
index d24420ed1c4c..53f608028c78 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -345,18 +345,29 @@ void bio_chain(struct bio *bio, struct bio *parent)
}
EXPORT_SYMBOL(bio_chain);
-struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
- unsigned int nr_pages, blk_opf_t opf, gfp_t gfp)
+/**
+ * bio_chain_and_submit - submit a bio after chaining it to another one
+ * @prev: bio to chain and submit
+ * @new: bio to chain to
+ *
+ * If @prev is non-NULL, chain it to @new and submit it.
+ *
+ * Return: @new.
+ */
+struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new)
{
- struct bio *new = bio_alloc(bdev, nr_pages, opf, gfp);
-
- if (bio) {
- bio_chain(bio, new);
- submit_bio(bio);
+ if (prev) {
+ bio_chain(prev, new);
+ submit_bio(prev);
}
-
return new;
}
+
+struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
+ unsigned int nr_pages, blk_opf_t opf, gfp_t gfp)
+{
+ return bio_chain_and_submit(bio, bio_alloc(bdev, nr_pages, opf, gfp));
+}
EXPORT_SYMBOL_GPL(blk_next_bio);
static void bio_alloc_rescue(struct work_struct *work)
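A hedged sketch of how a caller can drive bio_chain_and_submit(): build up a chain, letting the helper submit each previous bio, then wait on the final one. next_bio_to_send() is a hypothetical stand-in for whatever prepares the next fully mapped bio; this is illustrative, not code from this pull.

	/* Illustrative only: chain-and-submit a series of bios, wait on the last. */
	struct bio *prev = NULL, *next;
	int err = 0;

	while ((next = next_bio_to_send(bdev))) {
		/* Submits @prev (if any) after chaining it to @next; returns @next. */
		prev = bio_chain_and_submit(prev, next);
	}
	if (prev) {
		err = submit_bio_wait(prev);	/* completes only once the whole chain has */
		bio_put(prev);
	}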
@@ -1384,6 +1395,26 @@ int submit_bio_wait(struct bio *bio)
}
EXPORT_SYMBOL(submit_bio_wait);
+static void bio_wait_end_io(struct bio *bio)
+{
+ complete(bio->bi_private);
+ bio_put(bio);
+}
+
+/*
+ * bio_await_chain - ends @bio and waits for every chained bio to complete
+ */
+void bio_await_chain(struct bio *bio)
+{
+ DECLARE_COMPLETION_ONSTACK_MAP(done,
+ bio->bi_bdev->bd_disk->lockdep_map);
+
+ bio->bi_private = &done;
+ bio->bi_end_io = bio_wait_end_io;
+ bio_endio(bio);
+ blk_wait_io(&done);
+}
+
void __bio_advance(struct bio *bio, unsigned bytes)
{
if (bio_integrity(bio))
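A hedged sketch of the intended use of bio_await_chain(): when a caller decides not to submit the final bio of a chain built with bio_chain_and_submit() (for example because a fatal signal is pending), this ends that bio and waits for the already-submitted chained bios. Note that it also puts the bio, so no bio_put() afterwards. Illustrative only, not code from this pull:

	if (fatal_signal_pending(current)) {
		bio_await_chain(prev);	/* ends @prev, waits for the submitted chain, puts @prev */
		return -EINTR;
	}
	err = submit_bio_wait(prev);
	bio_put(prev);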
@@ -1576,6 +1607,8 @@ again:
if (!bio_integrity_endio(bio))
return;
+ blk_zone_bio_endio(bio);
+
rq_qos_done_bio(bio);
if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
@@ -1596,7 +1629,6 @@ again:
goto again;
}
- blk_throtl_bio_endio(bio);
/* release cgroup info */
bio_uninit(bio);
if (bio->bi_end_io)
diff --git a/block/blk-cgroup-rwstat.c b/block/blk-cgroup-rwstat.c
index 3304e841df7c..a55fb0c53558 100644
--- a/block/blk-cgroup-rwstat.c
+++ b/block/blk-cgroup-rwstat.c
@@ -9,25 +9,19 @@ int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp)
{
int i, ret;
- for (i = 0; i < BLKG_RWSTAT_NR; i++) {
- ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp);
- if (ret) {
- while (--i >= 0)
- percpu_counter_destroy(&rwstat->cpu_cnt[i]);
- return ret;
- }
+ ret = percpu_counter_init_many(rwstat->cpu_cnt, 0, gfp, BLKG_RWSTAT_NR);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
atomic64_set(&rwstat->aux_cnt[i], 0);
- }
return 0;
}
EXPORT_SYMBOL_GPL(blkg_rwstat_init);
void blkg_rwstat_exit(struct blkg_rwstat *rwstat)
{
- int i;
-
- for (i = 0; i < BLKG_RWSTAT_NR; i++)
- percpu_counter_destroy(&rwstat->cpu_cnt[i]);
+ percpu_counter_destroy_many(rwstat->cpu_cnt, BLKG_RWSTAT_NR);
}
EXPORT_SYMBOL_GPL(blkg_rwstat_exit);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 059467086b13..4b1a35ab0ea4 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -218,8 +218,7 @@ static void blkg_async_bio_workfn(struct work_struct *work)
/* as long as there are pending bios, @blkg can't go away */
spin_lock(&blkg->async_bio_lock);
- bio_list_merge(&bios, &blkg->async_bios);
- bio_list_init(&blkg->async_bios);
+ bio_list_merge_init(&bios, &blkg->async_bios);
spin_unlock(&blkg->async_bio_lock);
/* start plug only when bio_list contains at least 2 bios */
@@ -1444,14 +1443,8 @@ int blkcg_init_disk(struct gendisk *disk)
if (ret)
goto err_destroy_all;
- ret = blk_throtl_init(disk);
- if (ret)
- goto err_ioprio_exit;
-
return 0;
-err_ioprio_exit:
- blk_ioprio_exit(disk);
err_destroy_all:
blkg_destroy_all(disk);
return ret;
diff --git a/block/blk-core.c b/block/blk-core.c
index b795ac177281..01186333c88e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -591,8 +591,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
return BLK_STS_NOTSUPP;
/* The bio sector must point to the start of a sequential zone */
- if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector) ||
- !bio_zone_is_seq(bio))
+ if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector))
return BLK_STS_IOERR;
/*
@@ -604,7 +603,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
return BLK_STS_IOERR;
/* Make sure the BIO is small enough and will not get split */
- if (nr_sectors > q->limits.max_zone_append_sectors)
+ if (nr_sectors > queue_max_zone_append_sectors(q))
return BLK_STS_IOERR;
bio->bi_opf |= REQ_NOMERGE;
@@ -649,11 +648,13 @@ static void __submit_bio(struct bio *bio)
static void __submit_bio_noacct(struct bio *bio)
{
struct bio_list bio_list_on_stack[2];
+ struct blk_plug plug;
BUG_ON(bio->bi_next);
bio_list_init(&bio_list_on_stack[0]);
current->bio_list = bio_list_on_stack;
+ blk_start_plug(&plug);
do {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
@@ -687,19 +688,23 @@ static void __submit_bio_noacct(struct bio *bio)
bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
} while ((bio = bio_list_pop(&bio_list_on_stack[0])));
+ blk_finish_plug(&plug);
current->bio_list = NULL;
}
static void __submit_bio_noacct_mq(struct bio *bio)
{
struct bio_list bio_list[2] = { };
+ struct blk_plug plug;
current->bio_list = bio_list;
+ blk_start_plug(&plug);
do {
__submit_bio(bio);
} while ((bio = bio_list_pop(&bio_list[0])));
+ blk_finish_plug(&plug);
current->bio_list = NULL;
}
@@ -910,12 +915,6 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
return 0;
- /*
- * As the requests that require a zone lock are not plugged in the
- * first place, directly accessing the plug instead of using
- * blk_mq_plug() should not have any consequences during flushing for
- * zoned devices.
- */
blk_flush_plug(current->plug, false);
/*
@@ -987,10 +986,11 @@ void update_io_ticks(struct block_device *part, unsigned long now, bool end)
unsigned long stamp;
again:
stamp = READ_ONCE(part->bd_stamp);
- if (unlikely(time_after(now, stamp))) {
- if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now)))
- __part_stat_add(part, io_ticks, end ? now - stamp : 1);
- }
+ if (unlikely(time_after(now, stamp)) &&
+ likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) &&
+ (end || part_in_flight(part)))
+ __part_stat_add(part, io_ticks, now - stamp);
+
if (part->bd_partno) {
part = bdev_whole(part);
goto again;
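The precise io_ticks accounting above is what the "util can be greater than 100%" fix in the merge summary addresses: tools derive %util from the io_ticks counter (milliseconds with I/O in flight) exported via /proc/diskstats. A rough userspace sketch of that derivation, assuming the field layout documented in Documentation/admin-guide/iostats.rst and using "sda" as an example disk:

#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Return the io_ticks counter (ms with I/O in flight) for @disk, or 0. */
static unsigned long long read_io_ticks(const char *disk)
{
	FILE *f = fopen("/proc/diskstats", "r");
	char line[256], name[64];
	unsigned long long v[11];
	unsigned long long ticks = 0;
	unsigned int major, minor;

	if (!f)
		return 0;
	while (fgets(line, sizeof(line), f)) {
		/* major, minor, name, then per-device counters; io_ticks is the 10th counter */
		if (sscanf(line, "%u %u %63s %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
			   &major, &minor, name, &v[0], &v[1], &v[2], &v[3], &v[4],
			   &v[5], &v[6], &v[7], &v[8], &v[9], &v[10]) >= 13 &&
		    !strcmp(name, disk)) {
			ticks = v[9];
			break;
		}
	}
	fclose(f);
	return ticks;
}

int main(void)
{
	unsigned long long t0 = read_io_ticks("sda");	/* example disk */

	sleep(1);
	/* %util over a ~1000 ms window; the accounting fix keeps this bounded near 100% */
	printf("util: %.1f%%\n", (read_io_ticks("sda") - t0) / 1000.0 * 100.0);
	return 0;
}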
diff --git a/block/blk-flush.c b/block/blk-flush.c
index b0f314f4bc14..c17cf8ed8113 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -130,6 +130,8 @@ static void blk_flush_restore_request(struct request *rq)
* original @rq->bio. Restore it.
*/
rq->bio = rq->biotail;
+ if (rq->bio)
+ rq->__sector = rq->bio->bi_iter.bi_sector;
/* make @rq a normal request */
rq->rq_flags &= ~RQF_FLUSH_SEQ;
diff --git a/block/blk-lib.c b/block/blk-lib.c
index a6954eafb8c8..442da9dad042 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -35,51 +35,39 @@ static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector)
return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT;
}
-int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask, struct bio **biop)
+struct bio *blk_alloc_discard_bio(struct block_device *bdev,
+ sector_t *sector, sector_t *nr_sects, gfp_t gfp_mask)
{
- struct bio *bio = *biop;
- sector_t bs_mask;
-
- if (bdev_read_only(bdev))
- return -EPERM;
- if (!bdev_max_discard_sectors(bdev))
- return -EOPNOTSUPP;
-
- /* In case the discard granularity isn't set by buggy device driver */
- if (WARN_ON_ONCE(!bdev_discard_granularity(bdev))) {
- pr_err_ratelimited("%pg: Error: discard_granularity is 0.\n",
- bdev);
- return -EOPNOTSUPP;
- }
-
- bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
- if ((sector | nr_sects) & bs_mask)
- return -EINVAL;
+ sector_t bio_sects = min(*nr_sects, bio_discard_limit(bdev, *sector));
+ struct bio *bio;
- if (!nr_sects)
- return -EINVAL;
+ if (!bio_sects)
+ return NULL;
- while (nr_sects) {
- sector_t req_sects =
- min(nr_sects, bio_discard_limit(bdev, sector));
+ bio = bio_alloc(bdev, 0, REQ_OP_DISCARD, gfp_mask);
+ if (!bio)
+ return NULL;
+ bio->bi_iter.bi_sector = *sector;
+ bio->bi_iter.bi_size = bio_sects << SECTOR_SHIFT;
+ *sector += bio_sects;
+ *nr_sects -= bio_sects;
+ /*
+ * We can loop for a long time in here if someone does full device
+ * discards (like mkfs). Be nice and allow us to schedule out to avoid
+ * softlocking if preempt is disabled.
+ */
+ cond_resched();
+ return bio;
+}
- bio = blk_next_bio(bio, bdev, 0, REQ_OP_DISCARD, gfp_mask);
- bio->bi_iter.bi_sector = sector;
- bio->bi_iter.bi_size = req_sects << 9;
- sector += req_sects;
- nr_sects -= req_sects;
-
- /*
- * We can loop for a long time in here, if someone does
- * full device discards (like mkfs). Be nice and allow
- * us to schedule out to avoid softlocking if preempt
- * is disabled.
- */
- cond_resched();
- }
+int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask, struct bio **biop)
+{
+ struct bio *bio;
- *biop = bio;
+ while ((bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects,
+ gfp_mask)))
+ *biop = bio_chain_and_submit(*biop, bio);
return 0;
}
EXPORT_SYMBOL(__blkdev_issue_discard);
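Taken together with bio_chain_and_submit() and bio_await_chain() from bio.c above, one plausible shape for an interruptible discard caller is sketched below, assuming a killable, ioctl-style context with sector/nr_sects already validated; this is an illustration, not the exact code added elsewhere in the series:

	struct bio *prev = NULL, *bio;
	int err = 0;

	while ((bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects, GFP_KERNEL))) {
		prev = bio_chain_and_submit(prev, bio);	/* @prev becomes the unsubmitted tail */
		if (fatal_signal_pending(current)) {
			bio_await_chain(prev);		/* drain what was already submitted */
			return -EINTR;
		}
	}
	if (prev) {
		err = submit_bio_wait(prev);
		bio_put(prev);
	}
	return err;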
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4e3483a16b75..8534c35e0497 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -377,6 +377,7 @@ struct bio *__bio_split_to_limits(struct bio *bio,
blkcg_bio_issue_init(split);
bio_chain(split, bio);
trace_block_split(split, bio->bi_iter.bi_sector);
+ WARN_ON_ONCE(bio_zone_write_plugging(bio));
submit_bio_noacct(bio);
return split;
}
@@ -779,6 +780,8 @@ static void blk_account_io_merge_request(struct request *req)
if (blk_do_io_stat(req)) {
part_stat_lock();
part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
+ part_stat_local_dec(req->part,
+ in_flight[op_is_write(req_op(req))]);
part_stat_unlock();
}
}
@@ -972,13 +975,7 @@ static void blk_account_io_merge_bio(struct request *req)
part_stat_unlock();
}
-enum bio_merge_status {
- BIO_MERGE_OK,
- BIO_MERGE_NONE,
- BIO_MERGE_FAILED,
-};
-
-static enum bio_merge_status bio_attempt_back_merge(struct request *req,
+enum bio_merge_status bio_attempt_back_merge(struct request *req,
struct bio *bio, unsigned int nr_segs)
{
const blk_opf_t ff = bio_failfast(bio);
@@ -994,6 +991,9 @@ static enum bio_merge_status bio_attempt_back_merge(struct request *req,
blk_update_mixed_merge(req, bio, false);
+ if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
+ blk_zone_write_plug_bio_merged(bio);
+
req->biotail->bi_next = bio;
req->biotail = bio;
req->__data_len += bio->bi_iter.bi_size;
@@ -1009,6 +1009,14 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req,
{
const blk_opf_t ff = bio_failfast(bio);
+ /*
+ * A front merge for writes to sequential zones of a zoned block device
+ * can happen only if the user submitted writes out of order. Do not
+ * merge such write to let it fail.
+ */
+ if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
+ return BIO_MERGE_FAILED;
+
if (!ll_front_merge_fn(req, bio, nr_segs))
return BIO_MERGE_FAILED;
@@ -1107,10 +1115,9 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs)
{
- struct blk_plug *plug;
+ struct blk_plug *plug = current->plug;
struct request *rq;
- plug = blk_mq_plug(bio);
if (!plug || rq_list_empty(plug->mq_list))
return false;
diff --git a/block/blk-mq-debugfs-zoned.c b/block/blk-mq-debugfs-zoned.c
deleted file mode 100644
index a77b099c34b7..000000000000
--- a/block/blk-mq-debugfs-zoned.c
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2017 Western Digital Corporation or its affiliates.
- */
-
-#include <linux/blkdev.h>
-#include "blk-mq-debugfs.h"
-
-int queue_zone_wlock_show(void *data, struct seq_file *m)
-{
- struct request_queue *q = data;
- unsigned int i;
-
- if (!q->disk->seq_zones_wlock)
- return 0;
-
- for (i = 0; i < q->disk->nr_zones; i++)
- if (test_bit(i, q->disk->seq_zones_wlock))
- seq_printf(m, "%u\n", i);
-
- return 0;
-}
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 94668e72ab09..770c0c2b72fa 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -160,7 +160,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
{ "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops },
{ "pm_only", 0600, queue_pm_only_show, NULL },
{ "state", 0600, queue_state_show, queue_state_write },
- { "zone_wlock", 0400, queue_zone_wlock_show, NULL },
+ { "zone_wplugs", 0400, queue_zone_wplugs_show, NULL },
{ },
};
@@ -256,7 +256,6 @@ static const char *const rqf_name[] = {
RQF_NAME(HASHED),
RQF_NAME(STATS),
RQF_NAME(SPECIAL_PAYLOAD),
- RQF_NAME(ZONE_WRITE_LOCKED),
RQF_NAME(TIMED_OUT),
RQF_NAME(RESV),
};
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index 9c7d4b6117d4..c80e453e3014 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -83,10 +83,10 @@ static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
}
#endif
-#ifdef CONFIG_BLK_DEBUG_FS_ZONED
-int queue_zone_wlock_show(void *data, struct seq_file *m);
+#if defined(CONFIG_BLK_DEV_ZONED) && defined(CONFIG_BLK_DEBUG_FS)
+int queue_zone_wplugs_show(void *data, struct seq_file *m);
#else
-static inline int queue_zone_wlock_show(void *data, struct seq_file *m)
+static inline int queue_zone_wplugs_show(void *data, struct seq_file *m)
{
return 0;
}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 32afb87efbd0..8e01e4b32e10 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -28,6 +28,7 @@
#include <linux/prefetch.h>
#include <linux/blk-crypto.h>
#include <linux/part_stat.h>
+#include <linux/sched/isolation.h>
#include <trace/events/block.h>
@@ -690,6 +691,8 @@ static void blk_mq_finish_request(struct request *rq)
{
struct request_queue *q = rq->q;
+ blk_zone_finish_request(rq);
+
if (rq->rq_flags & RQF_USE_SCHED) {
q->elevator->type->ops.finish_request(rq);
/*
@@ -761,31 +764,6 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
}
EXPORT_SYMBOL(blk_dump_rq_flags);
-static void req_bio_endio(struct request *rq, struct bio *bio,
- unsigned int nbytes, blk_status_t error)
-{
- if (unlikely(error)) {
- bio->bi_status = error;
- } else if (req_op(rq) == REQ_OP_ZONE_APPEND) {
- /*
- * Partial zone append completions cannot be supported as the
- * BIO fragments may end up not being written sequentially.
- */
- if (bio->bi_iter.bi_size != nbytes)
- bio->bi_status = BLK_STS_IOERR;
- else
- bio->bi_iter.bi_sector = rq->__sector;
- }
-
- bio_advance(bio, nbytes);
-
- if (unlikely(rq->rq_flags & RQF_QUIET))
- bio_set_flag(bio, BIO_QUIET);
- /* don't actually finish bio if it's part of flush sequence */
- if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
- bio_endio(bio);
-}
-
static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
if (req->part && blk_do_io_stat(req)) {
@@ -845,8 +823,7 @@ static void blk_complete_request(struct request *req)
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
- if (req_op(req) == REQ_OP_ZONE_APPEND)
- bio->bi_iter.bi_sector = req->__sector;
+ blk_zone_update_request_bio(req, bio);
if (!is_flush)
bio_endio(bio);
@@ -889,6 +866,8 @@ static void blk_complete_request(struct request *req)
bool blk_update_request(struct request *req, blk_status_t error,
unsigned int nr_bytes)
{
+ bool is_flush = req->rq_flags & RQF_FLUSH_SEQ;
+ bool quiet = req->rq_flags & RQF_QUIET;
int total_bytes;
trace_block_rq_complete(req, error, nr_bytes);
@@ -909,9 +888,8 @@ bool blk_update_request(struct request *req, blk_status_t error,
if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req))
__blk_crypto_rq_put_keyslot(req);
- if (unlikely(error && !blk_rq_is_passthrough(req) &&
- !(req->rq_flags & RQF_QUIET)) &&
- !test_bit(GD_DEAD, &req->q->disk->state)) {
+ if (unlikely(error && !blk_rq_is_passthrough(req) && !quiet) &&
+ !test_bit(GD_DEAD, &req->q->disk->state)) {
blk_print_req_error(req, error);
trace_block_rq_error(req, error, nr_bytes);
}
@@ -923,12 +901,33 @@ bool blk_update_request(struct request *req, blk_status_t error,
struct bio *bio = req->bio;
unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
- if (bio_bytes == bio->bi_iter.bi_size)
+ if (unlikely(error))
+ bio->bi_status = error;
+
+ if (bio_bytes == bio->bi_iter.bi_size) {
req->bio = bio->bi_next;
+ } else if (bio_is_zone_append(bio) && error == BLK_STS_OK) {
+ /*
+ * Partial zone append completions cannot be supported
+ * as the BIO fragments may end up not being written
+ * sequentially.
+ */
+ bio->bi_status = BLK_STS_IOERR;
+ }
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
- req_bio_endio(req, bio, bio_bytes, error);
+ if (unlikely(quiet))
+ bio_set_flag(bio, BIO_QUIET);
+
+ bio_advance(bio, bio_bytes);
+
+ /* Don't actually finish bio if it's part of flush sequence */
+ if (!bio->bi_iter.bi_size) {
+ blk_zone_update_request_bio(req, bio);
+ if (!is_flush)
+ bio_endio(bio);
+ }
total_bytes += bio_bytes;
nr_bytes -= bio_bytes;
@@ -997,6 +996,8 @@ static inline void blk_account_io_done(struct request *req, u64 now)
update_io_ticks(req->part, jiffies, true);
part_stat_inc(req->part, ios[sgrp]);
part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
+ part_stat_local_dec(req->part,
+ in_flight[op_is_write(req_op(req))]);
part_stat_unlock();
}
}
@@ -1019,6 +1020,8 @@ static inline void blk_account_io_start(struct request *req)
part_stat_lock();
update_io_ticks(req->part, jiffies, false);
+ part_stat_local_inc(req->part,
+ in_flight[op_is_write(req_op(req))]);
part_stat_unlock();
}
}
@@ -1330,11 +1333,6 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head)
blk_account_io_start(rq);
- /*
- * As plugging can be enabled for passthrough requests on a zoned
- * device, directly accessing the plug instead of using blk_mq_plug()
- * should not have any consequences.
- */
if (current->plug && !at_head) {
blk_add_rq_to_plug(current->plug, rq);
return;
@@ -1921,19 +1919,6 @@ static void blk_mq_handle_dev_resource(struct request *rq,
__blk_mq_requeue_request(rq);
}
-static void blk_mq_handle_zone_resource(struct request *rq,
- struct list_head *zone_list)
-{
- /*
- * If we end up here it is because we cannot dispatch a request to a
- * specific zone due to LLD level zone-write locking or other zone
- * related resource not being available. In this case, set the request
- * aside in zone_list for retrying it later.
- */
- list_add(&rq->queuelist, zone_list);
- __blk_mq_requeue_request(rq);
-}
-
enum prep_dispatch {
PREP_DISPATCH_OK,
PREP_DISPATCH_NO_TAG,
@@ -2019,7 +2004,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
struct request *rq;
int queued;
blk_status_t ret = BLK_STS_OK;
- LIST_HEAD(zone_list);
bool needs_resource = false;
if (list_empty(list))
@@ -2061,23 +2045,11 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
case BLK_STS_DEV_RESOURCE:
blk_mq_handle_dev_resource(rq, list);
goto out;
- case BLK_STS_ZONE_RESOURCE:
- /*
- * Move the request to zone_list and keep going through
- * the dispatch list to find more requests the drive can
- * accept.
- */
- blk_mq_handle_zone_resource(rq, &zone_list);
- needs_resource = true;
- break;
default:
blk_mq_end_request(rq, ret);
}
} while (!list_empty(list));
out:
- if (!list_empty(&zone_list))
- list_splice_tail_init(&zone_list, list);
-
/* If we didn't flush the entire list, we could have told the driver
* there was more coming, but that turned out to be a lie.
*/
@@ -2164,6 +2136,15 @@ static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
}
/*
+ * ->next_cpu is always calculated from hctx->cpumask, so simply use
+ * it for speeding up the check
+ */
+static bool blk_mq_hctx_empty_cpumask(struct blk_mq_hw_ctx *hctx)
+{
+ return hctx->next_cpu >= nr_cpu_ids;
+}
+
+/*
* It'd be great if the workqueue API had a way to pass
* in a mask and had some smarts for more clever placement.
* For now we just round-robin here, switching for every
@@ -2174,7 +2155,8 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
bool tried = false;
int next_cpu = hctx->next_cpu;
- if (hctx->queue->nr_hw_queues == 1)
+ /* Switch to unbound if no allowable CPUs in this hctx */
+ if (hctx->queue->nr_hw_queues == 1 || blk_mq_hctx_empty_cpumask(hctx))
return WORK_CPU_UNBOUND;
if (--hctx->next_cpu_batch <= 0) {
@@ -2948,22 +2930,37 @@ static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
void blk_mq_submit_bio(struct bio *bio)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- struct blk_plug *plug = blk_mq_plug(bio);
+ struct blk_plug *plug = current->plug;
const int is_sync = op_is_sync(bio->bi_opf);
struct blk_mq_hw_ctx *hctx;
unsigned int nr_segs = 1;
struct request *rq;
blk_status_t ret;
+ /*
+ * If the plug has a cached request for this queue, try to use it.
+ */
+ rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);
+
+ /*
+ * A BIO that was released from a zone write plug has already been
+ * through the preparation in this function, already holds a reference
+ * on the queue usage counter, and is the only write BIO in-flight for
+ * the target zone. Go straight to preparing a request for it.
+ */
+ if (bio_zone_write_plugging(bio)) {
+ nr_segs = bio->__bi_nr_segments;
+ if (rq)
+ blk_queue_exit(q);
+ goto new_request;
+ }
+
bio = blk_queue_bounce(bio, q);