author		Linus Torvalds <torvalds@linux-foundation.org>	2023-08-29 20:21:42 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2023-08-29 20:21:42 -0700
commit		3d3dfeb3aec7b612d266d500c82054f1fded4980 (patch)
tree		11649eab5c74deb74e6e0879613e8053ae3b9970 /block
parent		c1b7fcf3f6d94c2c3528bf77054bf174a5ef63d7 (diff)
parent		146afeb235ccec10c17ad8ea26327c0c79dbd968 (diff)
Merge tag 'for-6.6/block-2023-08-28' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe:
 "Pretty quiet round for this release. This contains:

   - Add support for zoned storage to ublk (Andreas, Ming)

   - Series improving performance for drivers that mark themselves as
     needing a blocking context for issue (Bart)

   - Cleanup the flush logic (Chengming)

   - sed opal keyring support (Greg)

   - Fixes and improvements to the integrity support (Jinyoung)

   - Add some exports for bcachefs that we can hopefully delete again in
     the future (Kent)

   - deadline throttling fix (Zhiguo)

   - Series allowing building the kernel without buffer_head support
     (Christoph)

   - Sanitize the bio page adding flow (Christoph)

   - Write back cache fixes (Christoph)

   - MD updates via Song:
       - Fix perf regression for raid0 large sequential writes (Jan)
       - Fix split bio iostat for raid0 (David)
       - Various raid1 fixes (Heinz, Xueshi)
       - raid6test build fixes (WANG)
       - Deprecate bitmap file support (Christoph)
       - Fix deadlock with md sync thread (Yu)
       - Refactor md io accounting (Yu)
       - Various non-urgent fixes (Li, Yu, Jack)

   - Various fixes and cleanups (Arnd, Azeem, Chengming, Damien, Li,
     Ming, Nitesh, Ruan, Tejun, Thomas, Xu)"

* tag 'for-6.6/block-2023-08-28' of git://git.kernel.dk/linux: (113 commits)
  block: use strscpy() to instead of strncpy()
  block: sed-opal: keyring support for SED keys
  block: sed-opal: Implement IOC_OPAL_REVERT_LSP
  block: sed-opal: Implement IOC_OPAL_DISCOVERY
  blk-mq: prealloc tags when increase tagset nr_hw_queues
  blk-mq: delete redundant tagset map update when fallback
  blk-mq: fix tags leak when shrink nr_hw_queues
  ublk: zoned: support REQ_OP_ZONE_RESET_ALL
  md: raid0: account for split bio in iostat accounting
  md/raid0: Fix performance regression for large sequential writes
  md/raid0: Factor out helper for mapping and submitting a bio
  md raid1: allow writebehind to work on any leg device set WriteMostly
  md/raid1: hold the barrier until handle_read_error() finishes
  md/raid1: free the r1bio before waiting for blocked rdev
  md/raid1: call free_r1bio() before allow_barrier() in raid_end_bio_io()
  blk-cgroup: Fix NULL deref caused by blkg_policy_data being installed before init
  drivers/rnbd: restore sysfs interface to rnbd-client
  md/raid5-cache: fix null-ptr-deref for r5l_flush_stripe_to_raid()
  raid6: test: only check for Altivec if building on powerpc hosts
  raid6: test: make sure all intermediate and artifact files are .gitignored
  ...
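On the blocking-context series above: a driver that must sleep while issuing I/O opts in by setting BLK_MQ_F_BLOCKING on its tag set, and the updated blk-mq core now takes that flag into account whenever it decides where to run a hardware queue. A minimal sketch of the opt-in, assuming a hypothetical my_mq_ops and illustrative sizes:

#include <linux/blk-mq.h>

static struct blk_mq_tag_set my_tag_set;	/* illustrative driver state */

static int my_driver_init_tags(void)
{
	memset(&my_tag_set, 0, sizeof(my_tag_set));
	my_tag_set.ops = &my_mq_ops;		/* assumed driver-defined ops */
	my_tag_set.nr_hw_queues = 1;
	my_tag_set.queue_depth = 64;
	my_tag_set.numa_node = NUMA_NO_NODE;
	/* ->queue_rq() may sleep: issue from process context only */
	my_tag_set.flags = BLK_MQ_F_BLOCKING;

	return blk_mq_alloc_tag_set(&my_tag_set);
}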
Diffstat (limited to 'block')
-rw-r--r--	block/Kconfig			3
-rw-r--r--	block/bio-integrity.c		59
-rw-r--r--	block/bio.c			142
-rw-r--r--	block/blk-cgroup.c		32
-rw-r--r--	block/blk-core.c		1
-rw-r--r--	block/blk-flush.c		26
-rw-r--r--	block/blk-iolatency.c		35
-rw-r--r--	block/blk-mq.c			45
-rw-r--r--	block/blk-settings.c		7
-rw-r--r--	block/blk-sysfs.c		21
-rw-r--r--	block/blk.h			10
-rw-r--r--	block/fops.c			143
-rw-r--r--	block/mq-deadline.c		3
-rw-r--r--	block/opal_proto.h		4
-rw-r--r--	block/partitions/cmdline.c	12
-rw-r--r--	block/sed-opal.c		252
16 files changed, 579 insertions, 216 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 86122e459fe0..f1364d1c0d93 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -5,6 +5,7 @@
menuconfig BLOCK
bool "Enable the block layer" if EXPERT
default y
+ select FS_IOMAP
select SBITMAP
help
Provide block layer support for the kernel.
@@ -183,6 +184,8 @@ config BLK_DEBUG_FS_ZONED
config BLK_SED_OPAL
bool "Logic for interfacing with Opal enabled SEDs"
+ depends on KEYS
+ select PSERIES_PLPKS if PPC_PSERIES
help
Builds Logic for interfacing with Opal enabled controllers.
Enabling this option enables users to setup/unlock/lock
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 4533eb491661..ec8ac8cf6e1b 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -123,20 +123,38 @@ void bio_integrity_free(struct bio *bio)
int bio_integrity_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
{
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
struct bio_integrity_payload *bip = bio_integrity(bio);
- if (bip->bip_vcnt >= bip->bip_max_vcnt) {
- printk(KERN_ERR "%s: bip_vec full\n", __func__);
+ if (((bip->bip_iter.bi_size + len) >> SECTOR_SHIFT) >
+ queue_max_hw_sectors(q))
return 0;
- }
- if (bip->bip_vcnt &&
- bvec_gap_to_prev(&bdev_get_queue(bio->bi_bdev)->limits,
- &bip->bip_vec[bip->bip_vcnt - 1], offset))
- return 0;
+ if (bip->bip_vcnt > 0) {
+ struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1];
+ bool same_page = false;
+
+ if (bvec_try_merge_hw_page(q, bv, page, len, offset,
+ &same_page)) {
+ bip->bip_iter.bi_size += len;
+ return len;
+ }
+
+ if (bip->bip_vcnt >=
+ min(bip->bip_max_vcnt, queue_max_integrity_segments(q)))
+ return 0;
+
+ /*
+ * If the queue doesn't support SG gaps and adding this segment
+ * would create a gap, disallow it.
+ */
+ if (bvec_gap_to_prev(&q->limits, bv, offset))
+ return 0;
+ }
bvec_set_page(&bip->bip_vec[bip->bip_vcnt], page, len, offset);
bip->bip_vcnt++;
+ bip->bip_iter.bi_size += len;
return len;
}
@@ -199,8 +217,6 @@ bool bio_integrity_prep(struct bio *bio)
unsigned long start, end;
unsigned int len, nr_pages;
unsigned int bytes, offset, i;
- unsigned int intervals;
- blk_status_t status;
if (!bi)
return true;
@@ -224,12 +240,10 @@ bool bio_integrity_prep(struct bio *bio)
!(bi->flags & BLK_INTEGRITY_GENERATE))
return true;
}
- intervals = bio_integrity_intervals(bi, bio_sectors(bio));
/* Allocate kernel buffer for protection data */
- len = intervals * bi->tuple_size;
+ len = bio_integrity_bytes(bi, bio_sectors(bio));
buf = kmalloc(len, GFP_NOIO);
- status = BLK_STS_RESOURCE;
if (unlikely(buf == NULL)) {
printk(KERN_ERR "could not allocate integrity buffer\n");
goto err_end_io;
@@ -244,12 +258,10 @@ bool bio_integrity_prep(struct bio *bio)
if (IS_ERR(bip)) {
printk(KERN_ERR "could not allocate data integrity bioset\n");
kfree(buf);
- status = BLK_STS_RESOURCE;
goto err_end_io;
}
bip->bip_flags |= BIP_BLOCK_INTEGRITY;
- bip->bip_iter.bi_size = len;
bip_set_seed(bip, bio->bi_iter.bi_sector);
if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM)
@@ -257,28 +269,18 @@ bool bio_integrity_prep(struct bio *bio)
/* Map it */
offset = offset_in_page(buf);
- for (i = 0 ; i < nr_pages ; i++) {
- int ret;
+ for (i = 0; i < nr_pages && len > 0; i++) {
bytes = PAGE_SIZE - offset;
- if (len <= 0)
- break;
-
if (bytes > len)
bytes = len;
- ret = bio_integrity_add_page(bio, virt_to_page(buf),
- bytes, offset);
-
- if (ret == 0) {
+ if (bio_integrity_add_page(bio, virt_to_page(buf),
+ bytes, offset) < bytes) {
printk(KERN_ERR "could not attach integrity payload\n");
- status = BLK_STS_RESOURCE;
goto err_end_io;
}
- if (ret < bytes)
- break;
-
buf += bytes;
len -= bytes;
offset = 0;
@@ -294,10 +296,9 @@ bool bio_integrity_prep(struct bio *bio)
return true;
err_end_io:
- bio->bi_status = status;
+ bio->bi_status = BLK_STS_RESOURCE;
bio_endio(bio);
return false;
-
}
EXPORT_SYMBOL(bio_integrity_prep);
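The reworked bio_integrity_add_page() above now enforces the hardware-sector, integrity-segment and SG-gap limits itself and accounts bip_iter.bi_size, so a caller only has to check for a short return. A sketch of such an attach loop, assuming driver-private prot_buf/prot_len state:

	while (prot_len > 0) {
		unsigned int bytes = min_t(unsigned int, prot_len,
					   PAGE_SIZE - offset_in_page(prot_buf));

		/* returns the bytes added; anything short means the bio
		 * (or the queue limits) cannot take more integrity data */
		if (bio_integrity_add_page(bio, virt_to_page(prot_buf),
					   bytes, offset_in_page(prot_buf)) < bytes)
			return -ENOMEM;

		prot_buf += bytes;
		prot_len -= bytes;
	}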
diff --git a/block/bio.c b/block/bio.c
index 8672179213b9..816d412c06e9 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -606,15 +606,15 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask)
}
EXPORT_SYMBOL(bio_kmalloc);
-void zero_fill_bio(struct bio *bio)
+void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
{
struct bio_vec bv;
struct bvec_iter iter;
- bio_for_each_segment(bv, bio, iter)
+ __bio_for_each_segment(bv, bio, iter, start)
memzero_bvec(&bv);
}
-EXPORT_SYMBOL(zero_fill_bio);
+EXPORT_SYMBOL(zero_fill_bio_iter);
/**
* bio_truncate - truncate the bio to small size of @new_size
@@ -903,9 +903,8 @@ static inline bool bio_full(struct bio *bio, unsigned len)
return false;
}
-static inline bool page_is_mergeable(const struct bio_vec *bv,
- struct page *page, unsigned int len, unsigned int off,
- bool *same_page)
+static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
+ unsigned int len, unsigned int off, bool *same_page)
{
size_t bv_end = bv->bv_offset + bv->bv_len;
phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1;
@@ -919,49 +918,15 @@ static inline bool page_is_mergeable(const struct bio_vec *bv,
return false;
*same_page = ((vec_end_addr & PAGE_MASK) == page_addr);
- if (*same_page)
- return true;
- else if (IS_ENABLED(CONFIG_KMSAN))
- return false;
- return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE);
-}
-
-/**
- * __bio_try_merge_page - try appending data to an existing bvec.
- * @bio: destination bio
- * @page: start page to add
- * @len: length of the data to add
- * @off: offset of the data relative to @page
- * @same_page: return if the segment has been merged inside the same page
- *
- * Try to add the data at @page + @off to the last bvec of @bio. This is a
- * useful optimisation for file systems with a block size smaller than the
- * page size.
- *
- * Warn if (@len, @off) crosses pages in case that @same_page is true.
- *
- * Return %true on success or %false on failure.
- */
-static bool __bio_try_merge_page(struct bio *bio, struct page *page,
- unsigned int len, unsigned int off, bool *same_page)
-{
- if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
- return false;
-
- if (bio->bi_vcnt > 0) {
- struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
-
- if (page_is_mergeable(bv, page, len, off, same_page)) {
- if (bio->bi_iter.bi_size > UINT_MAX - len) {
- *same_page = false;
- return false;
- }
- bv->bv_len += len;
- bio->bi_iter.bi_size += len;
- return true;
- }
+ if (!*same_page) {
+ if (IS_ENABLED(CONFIG_KMSAN))
+ return false;
+ if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE)
+ return false;
}
- return false;
+
+ bv->bv_len += len;
+ return true;
}
/*
@@ -969,11 +934,10 @@ static bool __bio_try_merge_page(struct bio *bio, struct page *page,
* size limit. This is not for normal read/write bios, but for passthrough
* or Zone Append operations that we can't split.
*/
-static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio,
- struct page *page, unsigned len,
- unsigned offset, bool *same_page)
+bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
+ struct page *page, unsigned len, unsigned offset,
+ bool *same_page)
{
- struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
unsigned long mask = queue_segment_boundary(q);
phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset;
phys_addr_t addr2 = page_to_phys(page) + offset + len - 1;
@@ -982,7 +946,7 @@ static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio,
return false;
if (bv->bv_len + len > queue_max_segment_size(q))
return false;
- return __bio_try_merge_page(bio, page, len, offset, same_page);
+ return bvec_try_merge_page(bv, page, len, offset, same_page);
}
/**
@@ -1002,33 +966,33 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
unsigned int max_sectors, bool *same_page)
{
- struct bio_vec *bvec;
-
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return 0;
- if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors)
+ if (((bio->bi_iter.bi_size + len) >> SECTOR_SHIFT) > max_sectors)
return 0;
if (bio->bi_vcnt > 0) {
- if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page))
+ struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+
+ if (bvec_try_merge_hw_page(q, bv, page, len, offset,
+ same_page)) {
+ bio->bi_iter.bi_size += len;
return len;
+ }
+
+ if (bio->bi_vcnt >=
+ min(bio->bi_max_vecs, queue_max_segments(q)))
+ return 0;
/*
* If the queue doesn't support SG gaps and adding this segment
* would create a gap, disallow it.
*/
- bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
- if (bvec_gap_to_prev(&q->limits, bvec, offset))
+ if (bvec_gap_to_prev(&q->limits, bv, offset))
return 0;
}
- if (bio_full(bio, len))
- return 0;
-
- if (bio->bi_vcnt >= queue_max_segments(q))
- return 0;
-
bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset);
bio->bi_vcnt++;
bio->bi_iter.bi_size += len;
@@ -1129,11 +1093,21 @@ int bio_add_page(struct bio *bio, struct page *page,
{
bool same_page = false;
- if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
- if (bio_full(bio, len))
- return 0;
- __bio_add_page(bio, page, len, offset);
+ if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
+ return 0;
+ if (bio->bi_iter.bi_size > UINT_MAX - len)
+ return 0;
+
+ if (bio->bi_vcnt > 0 &&
+ bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
+ page, len, offset, &same_page)) {
+ bio->bi_iter.bi_size += len;
+ return len;
}
+
+ if (bio->bi_vcnt >= bio->bi_max_vecs)
+ return 0;
+ __bio_add_page(bio, page, len, offset);
return len;
}
EXPORT_SYMBOL(bio_add_page);
@@ -1207,13 +1181,18 @@ static int bio_iov_add_page(struct bio *bio, struct page *page,
{
bool same_page = false;
- if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
- __bio_add_page(bio, page, len, offset);
+ if (WARN_ON_ONCE(bio->bi_iter.bi_size > UINT_MAX - len))
+ return -EIO;
+
+ if (bio->bi_vcnt > 0 &&
+ bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
+ page, len, offset, &same_page)) {
+ bio->bi_iter.bi_size += len;
+ if (same_page)
+ bio_release_page(bio, page);
return 0;
}
-
- if (same_page)
- bio_release_page(bio, page);
+ __bio_add_page(bio, page, len, offset);
return 0;
}
@@ -1252,7 +1231,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
struct page **pages = (struct page **)bv;
ssize_t size, left;
unsigned len, i = 0;
- size_t offset, trim;
+ size_t offset;
int ret = 0;
/*
@@ -1281,10 +1260,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
- trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1);
- iov_iter_revert(iter, trim);
+ if (bio->bi_bdev) {
+ size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1);
+ iov_iter_revert(iter, trim);
+ size -= trim;
+ }
- size -= trim;
if (unlikely(!size)) {
ret = -EFAULT;
goto out;
@@ -1337,6 +1318,9 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
int ret = 0;
+ if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
+ return -EIO;
+
if (iov_iter_is_bvec(iter)) {
bio_iov_bvec_set(bio, iter);
iov_iter_advance(iter, bio->bi_iter.bi_size);
@@ -1490,6 +1474,7 @@ void bio_set_pages_dirty(struct bio *bio)
set_page_dirty_lock(bvec->bv_page);
}
}
+EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
/*
* bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
@@ -1549,6 +1534,7 @@ defer:
spin_unlock_irqrestore(&bio_dirty_lock, flags);
schedule_work(&bio_dirty_work);
}
+EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
static inline bool bio_remaining_done(struct bio *bio)
{
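With __bio_try_merge_page() folded away, bio_add_page() above either merges into the last bvec or appends a new one, returning len on success and 0 when the bio is full or bi_size would overflow. A sketch of the usual caller pattern under that contract; the allocation parameters are illustrative:

	if (bio_add_page(bio, page, len, offset) != len) {
		submit_bio(bio);	/* bio is full: send it on its way */
		bio = bio_alloc(bdev, nr_vecs, opf, GFP_NOIO);
		/* a freshly allocated bio with nr_vecs >= 1 has room */
		__bio_add_page(bio, page, len, offset);
	}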
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 9faafcd10e17..4a42ea2972ad 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1511,7 +1511,7 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
retry:
spin_lock_irq(&q->queue_lock);
- /* blkg_list is pushed at the head, reverse walk to allocate parents first */
+ /* blkg_list is pushed at the head, reverse walk to initialize parents first */
list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
struct blkg_policy_data *pd;
@@ -1549,21 +1549,20 @@ retry:
goto enomem;
}
- blkg->pd[pol->plid] = pd;
+ spin_lock(&blkg->blkcg->lock);
+
pd->blkg = blkg;
pd->plid = pol->plid;
- pd->online = false;
- }
+ blkg->pd[pol->plid] = pd;
- /* all allocated, init in the same order */
- if (pol->pd_init_fn)
- list_for_each_entry_reverse(blkg, &q->blkg_list, q_node)
- pol->pd_init_fn(blkg->pd[pol->plid]);
+ if (pol->pd_init_fn)
+ pol->pd_init_fn(pd);
- list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
if (pol->pd_online_fn)
- pol->pd_online_fn(blkg->pd[pol->plid]);
- blkg->pd[pol->plid]->online = true;
+ pol->pd_online_fn(pd);
+ pd->online = true;
+
+ spin_unlock(&blkg->blkcg->lock);
}
__set_bit(pol->plid, q->blkcg_pols);
@@ -1580,14 +1579,19 @@ out:
return ret;
enomem:
- /* alloc failed, nothing's initialized yet, free everything */
+ /* alloc failed, take down everything */
spin_lock_irq(&q->queue_lock);
list_for_each_entry(blkg, &q->blkg_list, q_node) {
struct blkcg *blkcg = blkg->blkcg;
+ struct blkg_policy_data *pd;
spin_lock(&blkcg->lock);
- if (blkg->pd[pol->plid]) {
- pol->pd_free_fn(blkg->pd[pol->plid]);
+ pd = blkg->pd[pol->plid];
+ if (pd) {
+ if (pd->online && pol->pd_offline_fn)
+ pol->pd_offline_fn(pd);
+ pd->online = false;
+ pol->pd_free_fn(pd);
blkg->pd[pol->plid] = NULL;
}
spin_unlock(&blkcg->lock);
diff --git a/block/blk-core.c b/block/blk-core.c
index 9866468c72a2..9d51e9894ece 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -208,6 +208,7 @@ const char *blk_status_to_str(blk_status_t status)
return "<null>";
return blk_errors[idx].name;
}
+EXPORT_SYMBOL_GPL(blk_status_to_str);
/**
* blk_sync_queue - cancel any pending callbacks on a queue
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 8220517c2d67..e73dc22d05c1 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -183,13 +183,13 @@ static void blk_flush_complete_seq(struct request *rq,
/* queue for flush */
if (list_empty(pending))
fq->flush_pending_since = jiffies;
- list_move_tail(&rq->flush.list, pending);
+ list_move_tail(&rq->queuelist, pending);
break;
case REQ_FSEQ_DATA:
- list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
+ fq->flush_data_in_flight++;
spin_lock(&q->requeue_lock);
- list_add(&rq->queuelist, &q->requeue_list);
+ list_move(&rq->queuelist, &q->requeue_list);
spin_unlock(&q->requeue_lock);
blk_mq_kick_requeue_list(q);
break;
@@ -201,7 +201,7 @@ static void blk_flush_complete_seq(struct request *rq,
* flush data request completion path. Restore @rq for
* normal completion and end it.
*/
- list_del_init(&rq->flush.list);
+ list_del_init(&rq->queuelist);
blk_flush_restore_request(rq);
blk_mq_end_request(rq, error);
break;
@@ -257,7 +257,7 @@ static enum rq_end_io_ret flush_end_io(struct request *flush_rq,
fq->flush_running_idx ^= 1;
/* and push the waiting requests to the next stage */
- list_for_each_entry_safe(rq, n, running, flush.list) {
+ list_for_each_entry_safe(rq, n, running, queuelist) {
unsigned int seq = blk_flush_cur_seq(rq);
BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
@@ -291,7 +291,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
{
struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
struct request *first_rq =
- list_first_entry(pending, struct request, flush.list);
+ list_first_entry(pending, struct request, queuelist);
struct request *flush_rq = fq->flush_rq;
/* C1 described at the top of this file */
@@ -299,7 +299,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
return;
/* C2 and C3 */
- if (!list_empty(&fq->flush_data_in_flight) &&
+ if (fq->flush_data_in_flight &&
time_before(jiffies,
fq->flush_pending_since + FLUSH_PENDING_TIMEOUT))
return;
@@ -374,6 +374,12 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq,
* the comment in flush_end_io().
*/
spin_lock_irqsave(&fq->mq_flush_lock, flags);
+ fq->flush_data_in_flight--;
+ /*
+ * May have been corrupted by rq->rq_next reuse, we need to
+ * re-initialize rq->queuelist before reusing it here.
+ */
+ INIT_LIST_HEAD(&rq->queuelist);
blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error);
spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
@@ -384,7 +390,6 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq,
static void blk_rq_init_flush(struct request *rq)
{
rq->flush.seq = 0;
- INIT_LIST_HEAD(&rq->flush.list);
rq->rq_flags |= RQF_FLUSH_SEQ;
rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
rq->end_io = mq_flush_data_end_io;
@@ -443,9 +448,9 @@ bool blk_insert_flush(struct request *rq)
* the post flush, and then just pass the command on.
*/
blk_rq_init_flush(rq);
- rq->flush.seq |= REQ_FSEQ_POSTFLUSH;
+ rq->flush.seq |= REQ_FSEQ_PREFLUSH;
spin_lock_irq(&fq->mq_flush_lock);
- list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
+ fq->flush_data_in_flight++;
spin_unlock_irq(&fq->mq_flush_lock);
return false;
default:
@@ -496,7 +501,6 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
INIT_LIST_HEAD(&fq->flush_queue[0]);
INIT_LIST_HEAD(&fq->flush_queue[1]);
- INIT_LIST_HEAD(&fq->flush_data_in_flight);
return fq;
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index fd5fec989e39..c1a6aba1d59e 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -824,29 +824,6 @@ static void iolatency_clear_scaling(struct blkcg_gq *blkg)
}
}
-static int blk_iolatency_try_init(struct blkg_conf_ctx *ctx)
-{
- static DEFINE_MUTEX(init_mutex);
- int ret;
-
- ret = blkg_conf_open_bdev(ctx);
- if (ret)
- return ret;
-
- /*
- * blk_iolatency_init() may fail after rq_qos_add() succeeds which can
- * confuse iolat_rq_qos() test. Make the test and init atomic.
- */
- mutex_lock(&init_mutex);
-
- if (!iolat_rq_qos(ctx->bdev->bd_queue))
- ret = blk_iolatency_init(ctx->bdev->bd_disk);
-
- mutex_unlock(&init_mutex);
-
- return ret;
-}
-
static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
@@ -861,7 +838,17 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
blkg_conf_init(&ctx, buf);
- ret = blk_iolatency_try_init(&ctx);
+ ret = blkg_conf_open_bdev(&ctx);
+ if (ret)
+ goto out;
+
+ /*
+ * blk_iolatency_init() may fail after rq_qos_add() succeeds which can
+ * confuse iolat_rq_qos() test. Make the test and init atomic.
+ */
+ lockdep_assert_held(&ctx.bdev->bd_queue->rq_qos_mutex);
+ if (!iolat_rq_qos(ctx.bdev->bd_queue))
+ ret = blk_iolatency_init(ctx.bdev->bd_disk);
if (ret)
goto out;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 953f08354c8c..ec922c6bccbe 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -43,6 +43,7 @@
#include "blk-ioprio.h"
static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
+static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd);
static void blk_mq_insert_request(struct request *rq, blk_insert_t flags);
static void blk_mq_request_bypass_insert(struct request *rq,
@@ -1174,15 +1175,11 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq)
static void blk_mq_complete_send_ipi(struct request *rq)
{
- struct llist_head *list;
unsigned int cpu;
cpu = rq->mq_ctx->cpu;
- list = &per_cpu(blk_cpu_done, cpu);
- if (llist_add(&rq->ipi_list, list)) {
- INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
- smp_call_function_single_async(cpu, &rq->csd);
- }
+ if (llist_add(&rq->ipi_list, &per_cpu(blk_cpu_done, cpu)))
+ smp_call_function_single_async(cpu, &per_cpu(blk_cpu_csd, cpu));
}
static void blk_mq_raise_softirq(struct request *rq)
@@ -1343,7 +1340,7 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head)
}
blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0);
- blk_mq_run_hw_queue(hctx, false);
+ blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING);
}
EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
@@ -2242,6 +2239,8 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
*/
WARN_ON_ONCE(!async && in_interrupt());
+ might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING);
+
/*
* When queue is quiesced, we may be switching io scheduler, or
* updating nr_hw_queues, or other things, and we can't run queue
@@ -2257,8 +2256,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
if (!need_run)
return;
- if (async || (hctx->flags & BLK_MQ_F_BLOCKING) ||
- !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) {
+ if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) {
blk_mq_delay_run_hw_queue(hctx, 0);
return;
}
@@ -2393,7 +2391,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
{
clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
- blk_mq_run_hw_queue(hctx, false);
+ blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING);
}
EXPORT_SYMBOL(blk_mq_start_hw_queue);
@@ -2423,7 +2421,8 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
- blk_mq_start_stopped_hw_queue(hctx, async);
+ blk_mq_start_stopped_hw_queue(hctx, async ||
+ (hctx->flags & BLK_MQ_F_BLOCKING));
}
EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
@@ -2481,6 +2480,8 @@ static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx,
list_for_each_entry(rq, list, queuelist) {
BUG_ON(rq->mq_ctx != ctx);
trace_block_rq_insert(rq);
+ if (rq->cmd_flags & REQ_NOWAIT)
+ run_queue_async = true;
}
spin_lock(&ctx->lock);
@@ -2641,7 +2642,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) {
blk_mq_insert_request(rq, 0);
- blk_mq_run_hw_queue(hctx, false);
+ blk_mq_run_hw_queue(hctx, rq->cmd_flags & REQ_NOWAIT);
return;
}
@@ -4402,9 +4403,13 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
int new_nr_hw_queues)
{
struct blk_mq_tags **new_tags;
+ int i;
- if (set->nr_hw_queues >= new_nr_hw_queues)
+ if (set->nr_hw_queues >= new_nr_hw_queues) {
+ for (i = new_nr_hw_queues; i < set->nr_hw_queues; i++)
+ __blk_mq_free_map_and_rqs(set, i);
goto done;
+ }
new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
GFP_KERNEL, set->numa_node);
@@ -4416,6 +4421,16 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
sizeof(*set->tags));
kfree(set->tags);
set->tags = new_tags;
+
+ for (i = set->nr_hw_queues; i < new_nr_hw_queues; i++) {
+ if (!__blk_mq_alloc_map_and_rqs(set, i)) {
+ while (--i >= set->nr_hw_queues)
+ __blk_mq_free_map_and_rqs(set, i);
+ return -ENOMEM;
+ }
+ cond_resched();
+ }
+
done:
set->nr_hw_queues = new_nr_hw_queues;
return 0;
@@ -4749,7 +4764,6 @@ fallback:
__blk_mq_free_map_and_rqs(set, i);
set->nr_hw_queues = prev_nr_hw_queues;
- blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
goto fallback;
}
blk_mq_map_swqueue(q);
@@ -4853,6 +4867,9 @@ static int __init blk_mq_init(void)
for_each_possible_cpu(i)
init_llist_head(&per_cpu(blk_cpu_done, i));
+ for_each_possible_cpu(i)
+ INIT_CSD(&per_cpu(blk_cpu_csd, i),
+ __blk_mq_complete_request_remote, NULL);
open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
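The completion-IPI change above replaces the per-request csd with one call_single_data per CPU, relying on llist_add() returning true only when the per-CPU list goes from empty to non-empty. The same pattern in generic form; the my_* names are illustrative:

struct my_item {
	struct llist_node llnode;
	/* ... payload ... */
};

static DEFINE_PER_CPU(struct llist_head, my_done_list);
static DEFINE_PER_CPU(call_single_data_t, my_csd);

static void my_remote_complete(void *unused)
{
	struct llist_node *entries = llist_del_all(this_cpu_ptr(&my_done_list));

	/* walk 'entries' and complete each my_item here */
}

static void my_queue_remote(struct my_item *item, int cpu)
{
	/* only the entry that makes the list non-empty raises the IPI */
	if (llist_add(&item->llnode, &per_cpu(my_done_list, cpu)))
		smp_call_function_single_async(cpu, &per_cpu(my_csd, cpu));
}

/* setup, once per possible CPU:
 *	INIT_CSD(&per_cpu(my_csd, cpu), my_remote_complete, NULL);
 */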
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 4dd59059b788..0046b447268f 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -830,10 +830,13 @@ EXPORT_SYMBOL(blk_set_queue_depth);
*/
void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
{
- if (wc)
+ if (wc) {
+ blk_queue_flag_set(QUEUE_FLAG_HW_WC, q);
blk_queue_flag_set(QUEUE_FLAG_WC, q);
- else
+ } else {
+ blk_queue_flag_clear(QUEUE_FLAG_HW_WC, q);
blk_queue_flag_clear(QUEUE_FLAG_WC, q);
+ }
if (fua)
blk_queue_flag_set(QUEUE_FLAG_FUA, q);
else
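blk_queue_write_cache() above now also latches the hardware state into QUEUE_FLAG_HW_WC; the driver side is unchanged and still just reports what the device supports. A sketch, with an illustrative wrapper name:

static void my_setup_write_cache(struct request_queue *q, bool vwc, bool fua)
{
	/* QUEUE_FLAG_WC/FUA drive current behavior; QUEUE_FLAG_HW_WC now
	 * additionally records that the hardware really has a cache. */
	blk_queue_write_cache(q, vwc, fua);
}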
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index afc797fb0dfc..63e481262336 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -449,21 +449,16 @@ static ssize_t queue_wc_show(struct request_queue *q, char *page)
static ssize_t queue_wc_store(struct request_queue *q, const char *page,
size_t count)
{
- int set = -1;
-
- if (!strncmp(page, "write back", 10))
- set = 1;
- else if (!strncmp(page, "write through", 13) ||
- !strncmp(page, "none", 4))
- set = 0;
-
- if (set == -1)
- return -EINVAL;
-
- if (set)
+ if (!strncmp(page, "write back", 10)) {
+ if (!test_bit(QUEUE_FLAG_HW_WC, &q->queue_flags))
+ return -EINVAL;
blk_queue_flag_set(QUEUE_FLAG_WC, q);
- else
+ } else if (!strncmp(page, "write through", 13) ||
+ !strncmp(page, "none", 4)) {
blk_queue_flag_clear(QUEUE_FLAG_WC, q);
+ } else {
+ return -EINVAL;
+ }
return count;
}
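From userspace, the stricter queue_wc_store() above means enabling "write back" on a device that never reported a hardware write cache now fails with EINVAL instead of silently setting QUEUE_FLAG_WC. A plain write(2) sketch; the sysfs path is only an example:

#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>

int main(void)
{
	int fd = open("/sys/block/sda/queue/write_cache", O_WRONLY);

	if (fd < 0)
		return 1;
	if (write(fd, "write back", 10) < 0 && errno == EINVAL)
		fprintf(stderr, "device has no hardware write cache\n");
	close(fd);
	return 0;
}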
diff --git a/block/blk.h b/block/blk.h
index 608c5dcc516b..08a358bc0919 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -15,15 +15,14 @@ struct elevator_type;
extern struct dentry *blk_debugfs_root;
struct blk_flush_queue {
+ spinlock_t mq_flush_lock;
unsigned int flush_pending_idx:1;
unsigned int flush_running_idx:1;
blk_status_t rq_status;
unsigned long flush_pending_since;
struct list_head flush_queue[2];
- struct list_head flush_data_in_flight;
+ unsigned long flush_data_in_flight;
struct request *flush_rq;
-
- spinlock_t mq_flush_lock;
};
bool is_flush_rq(struct request *req);
@@ -76,6 +75,10 @@ struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
gfp_t gfp_mask);
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs);
+bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
+ struct page *page, unsigned len, unsigned offset,
+ bool *same_page);
+
static inline bool biovec_phys_mergeable(struct request_queue *q,
struct bio_vec *vec1, struct bio_vec *vec2)
{
@@ -251,7 +254,6 @@ static inline void bio_integrity_free(struct bio *bio)