From c4e21bcd0f9d01f9c5d6c52007f5541871a5b1de Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 7 Jul 2023 11:42:38 +0200 Subject: block: cleanup queue_wc_store Get rid of the local queue_wc_store variable and handle setting and clearing the QUEUE_FLAG_WC flag directly in the if / else if branches instead. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230707094239.107968-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index afc797fb0dfc..0cde6598fb2f 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -449,21 +449,13 @@ static ssize_t queue_wc_show(struct request_queue *q, char *page) static ssize_t queue_wc_store(struct request_queue *q, const char *page, size_t count) { - int set = -1; - if (!strncmp(page, "write back", 10)) - set = 1; + blk_queue_flag_set(QUEUE_FLAG_WC, q); else if (!strncmp(page, "write through", 13) || !strncmp(page, "none", 4)) - set = 0; - - if (set == -1) - return -EINVAL; - - if (set) - blk_queue_flag_set(QUEUE_FLAG_WC, q); - else blk_queue_flag_clear(QUEUE_FLAG_WC, q); + else + return -EINVAL; return count; } -- cgit v1.2.3 From 43c9835b144c7ce29efe142d662529662a9eb376 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 7 Jul 2023 11:42:39 +0200 Subject: block: don't allow enabling a cache on devices that don't support it Currently the write_cache attribute allows enabling the QUEUE_FLAG_WC flag on devices that never claimed the capability. Fix that by adding a QUEUE_FLAG_HW_WC flag that is set by blk_queue_write_cache and guards re-enabling the cache through sysfs. Note that any rescan that calls blk_queue_write_cache will still re-enable the write cache as in the current code.
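As context for the QUEUE_FLAG_HW_WC mechanism, a minimal hedged sketch of the driver side follows; only blk_queue_write_cache() and the flag semantics come from the patch, the function and variable names are hypothetical:

/*
 * Sketch: a driver advertising a volatile write cache at probe time.
 * blk_queue_write_cache() now also sets QUEUE_FLAG_HW_WC, which is what
 * later permits re-enabling "write back" through the sysfs write_cache
 * attribute for this device.
 */
static void example_setup_queue(struct request_queue *q)	/* hypothetical */
{
	bool vwc = true;	/* device reported a volatile write cache */

	blk_queue_write_cache(q, vwc, false /* no FUA */);
}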
Fixes: 93e9d8e836cb ("block: add ability to flag write back caching on a device") Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230707094239.107968-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 7 +++++-- block/blk-sysfs.c | 11 +++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 4dd59059b788..0046b447268f 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -830,10 +830,13 @@ EXPORT_SYMBOL(blk_set_queue_depth); */ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) { - if (wc) + if (wc) { + blk_queue_flag_set(QUEUE_FLAG_HW_WC, q); blk_queue_flag_set(QUEUE_FLAG_WC, q); - else + } else { + blk_queue_flag_clear(QUEUE_FLAG_HW_WC, q); blk_queue_flag_clear(QUEUE_FLAG_WC, q); + } if (fua) blk_queue_flag_set(QUEUE_FLAG_FUA, q); else diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 0cde6598fb2f..63e481262336 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -449,13 +449,16 @@ static ssize_t queue_wc_show(struct request_queue *q, char *page) static ssize_t queue_wc_store(struct request_queue *q, const char *page, size_t count) { - if (!strncmp(page, "write back", 10)) + if (!strncmp(page, "write back", 10)) { + if (!test_bit(QUEUE_FLAG_HW_WC, &q->queue_flags)) + return -EINVAL; blk_queue_flag_set(QUEUE_FLAG_WC, q); - else if (!strncmp(page, "write through", 13) || - !strncmp(page, "none", 4)) + } else if (!strncmp(page, "write through", 13) || + !strncmp(page, "none", 4)) { blk_queue_flag_clear(QUEUE_FLAG_WC, q); - else + } else { return -EINVAL; + } return count; } -- cgit v1.2.3 From 660e802c76c89e871c29cd3174c07c8d23e39c35 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 17 Jul 2023 12:00:55 +0800 Subject: blk-mq: use percpu csd to remote complete instead of per-rq csd If a request needs to be completed remotely, we insert it into a percpu llist, and call smp_call_function_single_async() if the llist was previously empty. We don't need to use a per-rq csd; a percpu csd is enough. And the size of struct request is decreased by 24 bytes. This way is cleaner, and looks correct: the block softirq is guaranteed to be scheduled to consume the list if one new request is added to this percpu list, whether smp_call_function_single_async() returns -EBUSY or 0.
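Since the batching idiom here is easy to miss, a hedged sketch of the pattern the patch relies on follows (the per-CPU names are hypothetical; llist_add() returns true only when the list was previously empty, so at most one IPI is raised per batch):

/*
 * Sketch: remote completion with one shared csd per CPU. The csd is
 * bound once at init time, e.g. INIT_CSD(&per_cpu(example_csd, cpu),
 * handler, NULL), and the handler drains the whole llist.
 */
static DEFINE_PER_CPU(struct llist_head, example_done);
static DEFINE_PER_CPU(call_single_data_t, example_csd);

static void example_remote_complete(struct request *rq)
{
	unsigned int cpu = rq->mq_ctx->cpu;

	/* true only for the first request of a batch */
	if (llist_add(&rq->ipi_list, &per_cpu(example_done, cpu)))
		smp_call_function_single_async(cpu, &per_cpu(example_csd, cpu));
}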
Signed-off-by: Chengming Zhou Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230717040058.3993930-2-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index d50b1d62a3d9..d98654869615 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -43,6 +43,7 @@ #include "blk-ioprio.h" static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); +static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd); static void blk_mq_insert_request(struct request *rq, blk_insert_t flags); static void blk_mq_request_bypass_insert(struct request *rq, @@ -1157,15 +1158,11 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq) static void blk_mq_complete_send_ipi(struct request *rq) { - struct llist_head *list; unsigned int cpu; cpu = rq->mq_ctx->cpu; - list = &per_cpu(blk_cpu_done, cpu); - if (llist_add(&rq->ipi_list, list)) { - INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq); - smp_call_function_single_async(cpu, &rq->csd); - } + if (llist_add(&rq->ipi_list, &per_cpu(blk_cpu_done, cpu))) + smp_call_function_single_async(cpu, &per_cpu(blk_cpu_csd, cpu)); } static void blk_mq_raise_softirq(struct request *rq) @@ -4829,6 +4826,9 @@ static int __init blk_mq_init(void) for_each_possible_cpu(i) init_llist_head(&per_cpu(blk_cpu_done, i)); + for_each_possible_cpu(i) + INIT_CSD(&per_cpu(blk_cpu_csd, i), + __blk_mq_complete_request_remote, NULL); open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, -- cgit v1.2.3 From 28b241237470981a96fbd82077c8044466b61e5f Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 17 Jul 2023 12:00:56 +0800 Subject: blk-flush: fix rq->flush.seq for post-flush requests If the policy == (REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH), it means that the data sequence and post-flush sequence need to be done for this request. The rq->flush.seq should record what sequences have been done (or don't need to be done). Since in this case the pre-flush doesn't need to be done, we should init rq->flush.seq to REQ_FSEQ_PREFLUSH, not REQ_FSEQ_POSTFLUSH. Fixes: 615939a2ae73 ("blk-mq: defer to the normal submission path for post-flush requests") Signed-off-by: Chengming Zhou Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230717040058.3993930-3-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/blk-flush.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 8220517c2d67..fdc489e0ea16 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -443,7 +443,7 @@ bool blk_insert_flush(struct request *rq) * the post flush, and then just pass the command on. */ blk_rq_init_flush(rq); - rq->flush.seq |= REQ_FSEQ_POSTFLUSH; + rq->flush.seq |= REQ_FSEQ_PREFLUSH; spin_lock_irq(&fq->mq_flush_lock); list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); spin_unlock_irq(&fq->mq_flush_lock); -- cgit v1.2.3 From b175c86739d38e41044d3136065f092a6d95aee6 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 17 Jul 2023 12:00:57 +0800 Subject: blk-flush: count inflight flush_data requests The flush state machine uses a doubly linked list to link all inflight flush_data requests, to avoid issuing separate post-flushes for flush_data requests that share a PREFLUSH.
So we can't reuse rq->queuelist; this is why we need rq->flush.list. In preparation for the next patch, which reuses rq->queuelist for the flush state machine, we change the doubly linked list to an unsigned long counter that counts all inflight flush_data requests. This is ok since we only need to know whether there is any inflight flush_data request, so an unsigned long counter is good. Signed-off-by: Chengming Zhou Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230717040058.3993930-4-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/blk-flush.c | 9 +++++---- block/blk.h | 5 ++--- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index fdc489e0ea16..fedb39031647 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -187,7 +187,8 @@ static void blk_flush_complete_seq(struct request *rq, break; case REQ_FSEQ_DATA: - list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); + list_del_init(&rq->flush.list); + fq->flush_data_in_flight++; spin_lock(&q->requeue_lock); list_add(&rq->queuelist, &q->requeue_list); spin_unlock(&q->requeue_lock); @@ -299,7 +300,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, return; /* C2 and C3 */ - if (!list_empty(&fq->flush_data_in_flight) && + if (fq->flush_data_in_flight && time_before(jiffies, fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) return; @@ -374,6 +375,7 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, * the comment in flush_end_io(). */ spin_lock_irqsave(&fq->mq_flush_lock, flags); + fq->flush_data_in_flight--; blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error); spin_unlock_irqrestore(&fq->mq_flush_lock, flags); @@ -445,7 +447,7 @@ bool blk_insert_flush(struct request *rq) blk_rq_init_flush(rq); rq->flush.seq |= REQ_FSEQ_PREFLUSH; spin_lock_irq(&fq->mq_flush_lock); - list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); + fq->flush_data_in_flight++; spin_unlock_irq(&fq->mq_flush_lock); return false; default: @@ -496,7 +498,6 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, INIT_LIST_HEAD(&fq->flush_queue[0]); INIT_LIST_HEAD(&fq->flush_queue[1]); - INIT_LIST_HEAD(&fq->flush_data_in_flight); return fq; diff --git a/block/blk.h b/block/blk.h index 608c5dcc516b..686712e13835 100644 --- a/block/blk.h +++ b/block/blk.h @@ -15,15 +15,14 @@ struct elevator_type; extern struct dentry *blk_debugfs_root; struct blk_flush_queue { + spinlock_t mq_flush_lock; unsigned int flush_pending_idx:1; unsigned int flush_running_idx:1; blk_status_t rq_status; unsigned long flush_pending_since; struct list_head flush_queue[2]; - struct list_head flush_data_in_flight; + unsigned long flush_data_in_flight; struct request *flush_rq; - - spinlock_t mq_flush_lock; }; bool is_flush_rq(struct request *req); -- cgit v1.2.3 From 81ada09cc25e4bf2de7d2951925fb409338a545d Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 17 Jul 2023 12:00:58 +0800 Subject: blk-flush: reuse rq queuelist in flush state machine Since we don't need to maintain the inflight flush_data request list anymore, we can reuse rq->queuelist for the flush pending list. Note that in mq_flush_data_end_io(), we need to re-initialize rq->queuelist before reusing it in the state machine at completion time, since rq->rq_next also reuses the same storage and the driver may have corrupted rq->queuelist through it. This patch decreases the size of struct request by 16 bytes.
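To make the re-initialization requirement concrete, here is a hedged sketch with hypothetical names; only the fact that rq->rq_next shares storage with rq->queuelist is taken from the commit message above:

/*
 * Sketch: why INIT_LIST_HEAD() is needed before reusing the list node.
 */
struct example_req {				/* hypothetical */
	union {
		struct list_head queuelist;	/* flush machine linkage */
		struct example_req *rq_next;	/* plug list clobbers it */
	};
};

static void example_reenter_flush_machine(struct example_req *rq,
					  struct list_head *pending)
{
	/*
	 * list_move_tail() first unlinks the entry, i.e. it dereferences
	 * rq->queuelist.next/prev; if rq_next overwrote them this would
	 * follow a garbage pointer, hence the re-initialization.
	 */
	INIT_LIST_HEAD(&rq->queuelist);
	list_move_tail(&rq->queuelist, pending);
}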
Signed-off-by: Chengming Zhou Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20230717040058.3993930-5-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/blk-flush.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index fedb39031647..e73dc22d05c1 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -183,14 +183,13 @@ static void blk_flush_complete_seq(struct request *rq, /* queue for flush */ if (list_empty(pending)) fq->flush_pending_since = jiffies; - list_move_tail(&rq->flush.list, pending); + list_move_tail(&rq->queuelist, pending); break; case REQ_FSEQ_DATA: - list_del_init(&rq->flush.list); fq->flush_data_in_flight++; spin_lock(&q->requeue_lock); - list_add(&rq->queuelist, &q->requeue_list); + list_move(&rq->queuelist, &q->requeue_list); spin_unlock(&q->requeue_lock); blk_mq_kick_requeue_list(q); break; @@ -202,7 +201,7 @@ static void blk_flush_complete_seq(struct request *rq, * flush data request completion path. Restore @rq for * normal completion and end it. */ - list_del_init(&rq->flush.list); + list_del_init(&rq->queuelist); blk_flush_restore_request(rq); blk_mq_end_request(rq, error); break; @@ -258,7 +257,7 @@ static enum rq_end_io_ret flush_end_io(struct request *flush_rq, fq->flush_running_idx ^= 1; /* and push the waiting requests to the next stage */ - list_for_each_entry_safe(rq, n, running, flush.list) { + list_for_each_entry_safe(rq, n, running, queuelist) { unsigned int seq = blk_flush_cur_seq(rq); BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); @@ -292,7 +291,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, { struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; struct request *first_rq = - list_first_entry(pending, struct request, flush.list); + list_first_entry(pending, struct request, queuelist); struct request *flush_rq = fq->flush_rq; /* C1 described at the top of this file */ @@ -376,6 +375,11 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, */ spin_lock_irqsave(&fq->mq_flush_lock, flags); fq->flush_data_in_flight--; + /* + * May have been corrupted by rq->rq_next reuse, we need to + * re-initialize rq->queuelist before reusing it here. + */ + INIT_LIST_HEAD(&rq->queuelist); blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error); spin_unlock_irqrestore(&fq->mq_flush_lock, flags); @@ -386,7 +390,6 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, static void blk_rq_init_flush(struct request *rq) { rq->flush.seq = 0; - INIT_LIST_HEAD(&rq->flush.list); rq->rq_flags |= RQF_FLUSH_SEQ; rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ rq->end_io = mq_flush_data_end_io; -- cgit v1.2.3 From 8f63fef5867fb5e8c29d9c14b6d739bfc1869d32 Mon Sep 17 00:00:00 2001 From: Nitesh Shetty Date: Wed, 19 Jul 2023 17:46:08 +0530 Subject: block: refactor to use helper Reduce some code by making use of bio_integrity_bytes(). Signed-off-by: Nitesh Shetty Reviewed-by: "Martin K. 
Petersen" Link: https://lore.kernel.org/r/20230719121608.32105-1-nj.shetty@samsung.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'block') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 4533eb491661..8f0af7ac8573 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -199,7 +199,6 @@ bool bio_integrity_prep(struct bio *bio) unsigned long start, end; unsigned int len, nr_pages; unsigned int bytes, offset, i; - unsigned int intervals; blk_status_t status; if (!bi) return true; @@ -224,10 +223,9 @@ bool bio_integrity_prep(struct bio *bio) !(bi->flags & BLK_INTEGRITY_GENERATE)) return true; } - intervals = bio_integrity_intervals(bi, bio_sectors(bio)); /* Allocate kernel buffer for protection data */ - len = intervals * bi->tuple_size; + len = bio_integrity_bytes(bi, bio_sectors(bio)); buf = kmalloc(len, GFP_NOIO); status = BLK_STS_RESOURCE; if (unlikely(buf == NULL)) { -- cgit v1.2.3 From cd1d83e24e689f25de7e34bea697971750138d5f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:26 -0700 Subject: block: tidy up the bio full checks in bio_add_hw_page bio_add_hw_page already checks if the number of bytes trying to be added even fits into the max_hw_sectors limit of the queue. Remove the call to bio_full and just do a check for the smaller of the number of segments in the bio and the queue max segments limit, and do this cheap check before the more expensive gap-to-previous check. Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Link: https://lore.kernel.org/r/20230724165433.117645-2-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 8672179213b9..72488ecea47a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1014,6 +1014,10 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page)) return len; + if (bio->bi_vcnt >= + min(bio->bi_max_vecs, queue_max_segments(q))) + return 0; + /* * If the queue doesn't support SG gaps and adding this segment * would create a gap, disallow it. */ if (bvec_gap_to_prev(&q->limits, bvec, offset)) return 0; } - if (bio_full(bio, len)) - return 0; - - if (bio->bi_vcnt >= queue_max_segments(q)) - return 0; - bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset); bio->bi_vcnt++; bio->bi_iter.bi_size += len; -- cgit v1.2.3 From 6850b2dd5c25f27f7b74414553f047d4c12dd66c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:27 -0700 Subject: block: use SECTOR_SHIFT in bio_add_hw_page Use the SECTOR_SHIFT constant instead of the magic number 9.
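A small worked example of the change (plain arithmetic, variable names invented): SECTOR_SHIFT is 9 because the block layer's unit is the 512-byte sector, and the named constant documents that the shift is a byte-to-sector conversion rather than an arbitrary number:

/* Sketch: both lines compute the same value; only the intent differs. */
unsigned int bytes = 4608;			/* 9 * 512 */
unsigned int a = bytes >> 9;			/* 9 sectors, but 9 what? */
unsigned int b = bytes >> SECTOR_SHIFT;		/* clearly bytes -> sectors */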
Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230724165433.117645-3-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 72488ecea47a..445be4bdd99b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1007,7 +1007,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; - if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors) + if (((bio->bi_iter.bi_size + len) >> SECTOR_SHIFT) > max_sectors) return 0; if (bio->bi_vcnt > 0) { -- cgit v1.2.3 From 939e1a370330841b2c0292a483d7b38f3ee45f88 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:28 -0700 Subject: block: move the BIO_CLONED checks out of __bio_try_merge_page __bio_try_merge_page is a way too low-level helper to assert that the bio is not cloned. Move the check into bio_add_page and bio_iov_iter_get_pages instead, which are the high level entry points that should enforce this invariant. bio_add_hw_page already has this check, covering the third (indirect) caller of __bio_try_merge_page. Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230724165433.117645-4-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 445be4bdd99b..3ac72e60e7f1 100644 --- a/block/bio.c +++ b/block/bio.c @@ -945,9 +945,6 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, static bool __bio_try_merge_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off, bool *same_page) { - if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) - return false; - if (bio->bi_vcnt > 0) { struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; @@ -1127,6 +1124,9 @@ int bio_add_page(struct bio *bio, struct page *page, { bool same_page = false; + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return 0; + if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { if (bio_full(bio, len)) return 0; @@ -1335,6 +1335,9 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { int ret = 0; + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return -EIO; + if (iov_iter_is_bvec(iter)) { bio_iov_bvec_set(bio, iter); iov_iter_advance(iter, bio->bi_iter.bi_size); -- cgit v1.2.3 From 0eca8b6f97ac705c5806f7d062207379094fb114 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:29 -0700 Subject: block: move the bi_vcnt check out of __bio_try_merge_page Move the bi_vcnt check out of __bio_try_merge_page and into the two callers that don't already have it, in preparation for additional changes to __bio_try_merge_page.
Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230724165433.117645-5-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 45 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 23 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 3ac72e60e7f1..3dcbe98580dc 100644 --- a/block/bio.c +++ b/block/bio.c @@ -945,20 +945,17 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, static bool __bio_try_merge_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off, bool *same_page) { - if (bio->bi_vcnt > 0) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; - - if (page_is_mergeable(bv, page, len, off, same_page)) { - if (bio->bi_iter.bi_size > UINT_MAX - len) { - *same_page = false; - return false; - } - bv->bv_len += len; - bio->bi_iter.bi_size += len; - return true; - } + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (!page_is_mergeable(bv, page, len, off, same_page)) + return false; + if (bio->bi_iter.bi_size > UINT_MAX - len) { + *same_page = false; + return false; } - return false; + bv->bv_len += len; + bio->bi_iter.bi_size += len; + return true; } /* @@ -1127,11 +1124,13 @@ int bio_add_page(struct bio *bio, struct page *page, if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; - if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { - if (bio_full(bio, len)) - return 0; - __bio_add_page(bio, page, len, offset); - } + if (bio->bi_vcnt > 0 && + __bio_try_merge_page(bio, page, len, offset, &same_page)) + return len; + + if (bio_full(bio, len)) + return 0; + __bio_add_page(bio, page, len, offset); return len; } EXPORT_SYMBOL(bio_add_page); @@ -1205,13 +1204,13 @@ static int bio_iov_add_page(struct bio *bio, struct page *page, { bool same_page = false; - if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { - __bio_add_page(bio, page, len, offset); + if (bio->bi_vcnt > 0 && + __bio_try_merge_page(bio, page, len, offset, &same_page)) { + if (same_page) + bio_release_page(bio, page); return 0; } - - if (same_page) - bio_release_page(bio, page); + __bio_add_page(bio, page, len, offset); return 0; } -- cgit v1.2.3 From 613699050a49760f1d70c74f71bd0b013ca3c356 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:30 -0700 Subject: block: move the bi_size overflow check in __bio_try_merge_page Checking for availability in bi_size in a function that attempts to merge into an existing segment is a bit odd, as the limit also applies when adding a new segment. This code works fine as we always call __bio_try_merge_page, but contributes to sub-optimal calling conventions and doesn't lead to clear code. Move it to two of the callers instead; the third one already has a stricter check that includes max_hw_segments anyway.
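Since the shape of the bi_size guard is easy to misread, here is a hedged standalone sketch of why it is written as a subtraction (the helper name is invented):

/*
 * Sketch: with 32-bit quantities, "bi_size + len > UINT_MAX" can never
 * be true because the addition wraps first; the subtraction form cannot
 * overflow and catches exactly the would-wrap case.
 */
static bool example_would_overflow(unsigned int bi_size, unsigned int len)
{
	return bi_size > UINT_MAX - len;
}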
Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230724165433.117645-6-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 3dcbe98580dc..17f57fd2cff2 100644 --- a/block/bio.c +++ b/block/bio.c @@ -949,10 +949,6 @@ static bool __bio_try_merge_page(struct bio *bio, struct page *page, if (!page_is_mergeable(bv, page, len, off, same_page)) return false; - if (bio->bi_iter.bi_size > UINT_MAX - len) { - *same_page = false; - return false; - } bv->bv_len += len; bio->bi_iter.bi_size += len; return true; @@ -1123,6 +1119,8 @@ int bio_add_page(struct bio *bio, struct page *page, if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; + if (bio->bi_iter.bi_size > UINT_MAX - len) + return 0; if (bio->bi_vcnt > 0 && __bio_try_merge_page(bio, page, len, offset, &same_page)) @@ -1204,6 +1202,9 @@ static int bio_iov_add_page(struct bio *bio, struct page *page, { bool same_page = false; + if (WARN_ON_ONCE(bio->bi_iter.bi_size > UINT_MAX - len)) + return -EIO; + if (bio->bi_vcnt > 0 && __bio_try_merge_page(bio, page, len, offset, &same_page)) { if (same_page) -- cgit v1.2.3 From 80232b520314214d846eb0a65faef8b51b702fa7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:31 -0700 Subject: block: downgrade a bio_full call in bio_add_page bio_add_page already checks that there is space in bi_size a little earlier. So after we failed to add to an existing segment, just check that there is another one available instead of duplicating the bi_size check. Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Link: https://lore.kernel.org/r/20230724165433.117645-7-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 17f57fd2cff2..d8e0e8de8cf4 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1126,7 +1126,7 @@ int bio_add_page(struct bio *bio, struct page *page, __bio_try_merge_page(bio, page, len, offset, &same_page)) return len; - if (bio_full(bio, len)) + if (bio->bi_vcnt >= bio->bi_max_vecs) return 0; __bio_add_page(bio, page, len, offset); return len; -- cgit v1.2.3 From 858c708d9efb7e8e5c6320793b778cc17cf8368a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:32 -0700 Subject: block: move the bi_size update out of __bio_try_merge_page The update of bi_size is the only thing in __bio_try_merge_page that needs a bio. Move it to the callers, and merge __bio_try_merge_page and page_is_mergeable into a single bvec_try_merge_page that only takes the current bvec instead of a full bio. This will allow reusing this function for supporting multi-page integrity payload bvecs. 
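As a reminder of what "mergeable" means here, a hedged sketch of the physical-contiguity test that bvec_try_merge_page() keeps follows (the function name is invented; the condition mirrors the bio.c logic):

/*
 * Sketch: the new (page, off) range must start exactly one byte past
 * the physical end of the existing bvec for a merge to be possible.
 */
static bool example_physically_contiguous(const struct bio_vec *bv,
					  struct page *page, unsigned int off)
{
	phys_addr_t end = page_to_phys(bv->bv_page) + bv->bv_offset +
			  bv->bv_len - 1;

	return end + 1 == page_to_phys(page) + off;
}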
Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Link: https://lore.kernel.org/r/20230724165433.117645-8-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 57 ++++++++++++++++++++------------------------------------- 1 file changed, 20 insertions(+), 37 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index d8e0e8de8cf4..23b7a001b500 100644 --- a/block/bio.c +++ b/block/bio.c @@ -903,9 +903,8 @@ static inline bool bio_full(struct bio *bio, unsigned len) return false; } -static inline bool page_is_mergeable(const struct bio_vec *bv, - struct page *page, unsigned int len, unsigned int off, - bool *same_page) +static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, + unsigned int len, unsigned int off, bool *same_page) { size_t bv_end = bv->bv_offset + bv->bv_len; phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1; @@ -919,38 +918,14 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, return false; *same_page = ((vec_end_addr & PAGE_MASK) == page_addr); - if (*same_page) - return true; - else if (IS_ENABLED(CONFIG_KMSAN)) - return false; - return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE); -} - -/** - * __bio_try_merge_page - try appending data to an existing bvec. - * @bio: destination bio - * @page: start page to add - * @len: length of the data to add - * @off: offset of the data relative to @page - * @same_page: return if the segment has been merged inside the same page - * - * Try to add the data at @page + @off to the last bvec of @bio. This is a - * useful optimisation for file systems with a block size smaller than the - * page size. - * - * Warn if (@len, @off) crosses pages in case that @same_page is true. - * - * Return %true on success or %false on failure. 
- */ -static bool __bio_try_merge_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int off, bool *same_page) -{ - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + if (!*same_page) { + if (IS_ENABLED(CONFIG_KMSAN)) + return false; + if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE) + return false; + } - if (!page_is_mergeable(bv, page, len, off, same_page)) - return false; bv->bv_len += len; - bio->bi_iter.bi_size += len; return true; } @@ -972,7 +947,7 @@ static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio, return false; if (bv->bv_len + len > queue_max_segment_size(q)) return false; - return __bio_try_merge_page(bio, page, len, offset, same_page); + return bvec_try_merge_page(bv, page, len, offset, same_page); } /** @@ -1001,8 +976,11 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, return 0; if (bio->bi_vcnt > 0) { - if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page)) + if (bio_try_merge_hw_seg(q, bio, page, len, offset, + same_page)) { + bio->bi_iter.bi_size += len; return len; + } if (bio->bi_vcnt >= min(bio->bi_max_vecs, queue_max_segments(q))) @@ -1123,8 +1101,11 @@ int bio_add_page(struct bio *bio, struct page *page, return 0; if (bio->bi_vcnt > 0 && - __bio_try_merge_page(bio, page, len, offset, &same_page)) + bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], + page, len, offset, &same_page)) { + bio->bi_iter.bi_size += len; return len; + } if (bio->bi_vcnt >= bio->bi_max_vecs) return 0; @@ -1206,7 +1187,9 @@ static int bio_iov_add_page(struct bio *bio, struct page *page, return -EIO; if (bio->bi_vcnt > 0 && - __bio_try_merge_page(bio, page, len, offset, &same_page)) { + bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], + page, len, offset, &same_page)) { + bio->bi_iter.bi_size += len; if (same_page) bio_release_page(bio, page); return 0; -- cgit v1.2.3 From ae42f0b3bf65912e122fc2e8d5f6d94b51156dba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:33 -0700 Subject: block: don't pass a bio to bio_try_merge_hw_seg There is no good reason to pass the bio to bio_try_merge_hw_seg. Just pass the current bvec and rename the function to bvec_try_merge_hw_page. This will allow reusing this function for supporting multi-page integrity payload bvecs. Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Link: https://lore.kernel.org/r/20230724165433.117645-9-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 23b7a001b500..c92dda962449 100644 --- a/block/bio.c +++ b/block/bio.c @@ -934,11 +934,10 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, * size limit. This is not for normal read/write bios, but for passthrough * or Zone Append operations that we can't split. 
*/ -static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio, - struct page *page, unsigned len, - unsigned offset, bool *same_page) +static bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, + struct page *page, unsigned len, unsigned offset, + bool *same_page) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; unsigned long mask = queue_segment_boundary(q); phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset; phys_addr_t addr2 = page_to_phys(page) + offset + len - 1; @@ -967,8 +966,6 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page) { - struct bio_vec *bvec; - if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; @@ -976,7 +973,9 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, return 0; if (bio->bi_vcnt > 0) { - if (bio_try_merge_hw_seg(q, bio, page, len, offset, + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (bvec_try_merge_hw_page(q, bv, page, len, offset, same_page)) { bio->bi_iter.bi_size += len; return len; @@ -990,8 +989,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, * If the queue doesn't support SG gaps and adding this segment * would create a gap, disallow it. */ - bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; - if (bvec_gap_to_prev(&q->limits, bvec, offset)) + if (bvec_gap_to_prev(&q->limits, bv, offset)) return 0; } -- cgit v1.2.3 From 65a558f66c308251e256317957b75d1e643c33c3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 21 Jul 2023 10:27:30 -0700 Subject: block: Improve performance for BLK_MQ_F_BLOCKING drivers blk_mq_run_hw_queue() runs the queue asynchronously if BLK_MQ_F_BLOCKING has been set. This is suboptimal since running the queue asynchronously is slower than running the queue synchronously. This patch modifies blk_mq_run_hw_queue() as follows if BLK_MQ_F_BLOCKING has been set: - Run the queue synchronously if it is allowed to sleep. - Run the queue asynchronously if it is not allowed to sleep. Additionally, blk_mq_run_hw_queue(hctx, false) calls are modified into blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING) if the caller may be invoked from atomic context.
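Before walking the reviewed caller chains below, a hedged sketch of the resulting idiom for callers that may run in atomic context (the caller is hypothetical; blk_mq_run_hw_queue() and the flag test are from the patch):

/*
 * Sketch: run synchronously when the driver's ->queue_rq() cannot
 * sleep; punt to the async path only for BLK_MQ_F_BLOCKING drivers.
 */
static void example_kick_queue(struct blk_mq_hw_ctx *hctx)
{
	blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING);
}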
The following caller chains have been reviewed: blk_mq_run_hw_queue(hctx, false) blk_mq_get_tag() /* may sleep, hence the functions it calls may also sleep */ blk_execute_rq() /* may sleep */ blk_mq_run_hw_queues(q, async=false) blk_freeze_queue_start() /* may sleep */ blk_mq_requeue_work() /* may sleep */ scsi_kick_queue() scsi_requeue_run_queue() /* may sleep */ scsi_run_host_queues() scsi_ioctl_reset() /* may sleep */ blk_mq_insert_requests(hctx, ctx, list, run_queue_async=false) blk_mq_dispatch_plug_list(plug, from_sched=false) blk_mq_flush_plug_list(plug, from_schedule=false) __blk_flush_plug(plug, from_schedule=false) blk_add_rq_to_plug() blk_mq_submit_bio() /* may sleep if REQ_NOWAIT has not been set */ blk_mq_plug_issue_direct() blk_mq_flush_plug_list() /* see above */ blk_mq_dispatch_plug_list(plug, from_sched=false) blk_mq_flush_plug_list() /* see above */ blk_mq_try_issue_directly() blk_mq_submit_bio() /* may sleep if REQ_NOWAIT has not been set */ blk_mq_try_issue_list_directly(hctx, list) blk_mq_insert_requests() /* see above */ Cc: Christoph Hellwig Cc: Ming Lei Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20230721172731.955724-4-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index d98654869615..687ec3f4f10d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1323,7 +1323,7 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head) } blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); - blk_mq_run_hw_queue(hctx, false); + blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); @@ -2222,6 +2222,8 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) */ WARN_ON_ONCE(!async && in_interrupt()); + might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING); + /* * When queue is quiesced, we may be switching io scheduler, or * updating nr_hw_queues, or other things, and we can't run queue @@ -2237,8 +2239,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) if (!need_run) return; - if (async || (hctx->flags & BLK_MQ_F_BLOCKING) || - !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { + if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { blk_mq_delay_run_hw_queue(hctx, 0); return; } @@ -2373,7 +2374,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) { clear_bit(BLK_MQ_S_STOPPED, &hctx->state); - blk_mq_run_hw_queue(hctx, false); + blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); } EXPORT_SYMBOL(blk_mq_start_hw_queue); @@ -2403,7 +2404,8 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) unsigned long i; queue_for_each_hw_ctx(q, hctx, i) - blk_mq_start_stopped_hw_queue(hctx, async); + blk_mq_start_stopped_hw_queue(hctx, async || + (hctx->flags & BLK_MQ_F_BLOCKING)); } EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); @@ -2461,6 +2463,8 @@ static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, list_for_each_entry(rq, list, queuelist) { BUG_ON(rq->mq_ctx != ctx); trace_block_rq_insert(rq); + if (rq->cmd_flags & REQ_NOWAIT) + run_queue_async = true; } spin_lock(&ctx->lock); @@ -2621,7 +2625,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) { blk_mq_insert_request(rq, 0); - blk_mq_run_hw_queue(hctx, false); + blk_mq_run_hw_queue(hctx, rq->cmd_flags & 
REQ_NOWAIT); return; } -- cgit v1.2.3 From 51d74ec9b62f5813767a60226acaf943e26e7d7a Mon Sep 17 00:00:00 2001 From: Jinyoung Choi Date: Tue, 25 Jul 2023 14:18:39 +0900 Subject: block: cleanup bio_integrity_prep If a problem occurs in the process of creating an integrity payload, the status of bio is always BLK_STS_RESOURCE. Reviewed-by: Christoph Hellwig Signed-off-by: Jinyoung Choi Reviewed-by: "Martin K. Petersen" Link: https://lore.kernel.org/r/20230725051839epcms2p8e4d20ad6c51326ad032e8406f59d0aaa@epcms2p8 Signed-off-by: Jens Axboe --- block/bio-integrity.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'block') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 8f0af7ac8573..045553a164e0 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -199,7 +199,6 @@ bool bio_integrity_prep(struct bio *bio) unsigned long start, end; unsigned int len, nr_pages; unsigned int bytes, offset, i; - blk_status_t status; if (!bi) return true; @@ -227,7 +226,6 @@ bool bio_integrity_prep(struct bio *bio) /* Allocate kernel buffer for protection data */ len = bio_integrity_bytes(bi, bio_sectors(bio)); buf = kmalloc(len, GFP_NOIO); - status = BLK_STS_RESOURCE; if (unlikely(buf == NULL)) { printk(KERN_ERR "could not allocate integrity buffer\n"); goto err_end_io; @@ -242,7 +240,6 @@ bool bio_integrity_prep(struct bio *bio) if (IS_ERR(bip)) { printk(KERN_ERR "could not allocate data integrity bioset\n"); kfree(buf); - status = BLK_STS_RESOURCE; goto err_end_io; } @@ -270,7 +267,6 @@ bool bio_integrity_prep(struct bio *bio) if (ret == 0) { printk(KERN_ERR "could not attach integrity payload\n"); - status = BLK_STS_RESOURCE; goto err_end_io; } @@ -292,7 +288,7 @@ bool bio_integrity_prep(struct bio *bio) return true; err_end_io: - bio->bi_status = status; + bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); return false; -- cgit v1.2.3 From 727cfe976758b79f8d2f8051c75a5ccb14539a56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Aug 2023 19:21:58 +0200 Subject: block: open code __generic_file_write_iter for blkdev writes Open code __generic_file_write_iter to remove the indirect call into ->direct_IO and to prepare for using the iomap based write code. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Christian Brauner Reviewed-by: Hannes Reinecke Reviewed-by: Luis Chamberlain Link: https://lore.kernel.org/r/20230801172201.1923299-4-hch@lst.de Signed-off-by: Jens Axboe --- block/fops.c | 45 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/fops.c b/block/fops.c index a286bf3325c5..8a05d99166e3 100644 --- a/block/fops.c +++ b/block/fops.c @@ -533,6 +533,30 @@ static int blkdev_release(struct inode *inode, struct file *filp) return 0; } +static ssize_t +blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from) +{ + size_t count = iov_iter_count(from); + ssize_t written; + + written = kiocb_invalidate_pages(iocb, count); + if (written) { + if (written == -EBUSY) + return 0; + return written; + } + + written = blkdev_direct_IO(iocb, from); + if (written > 0) { + kiocb_invalidate_post_direct_write(iocb, count); + iocb->ki_pos += written; + count -= written; + } + if (written != -EIOCBQUEUED) + iov_iter_revert(from, count - iov_iter_count(from)); + return written; +} + /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device.
@@ -542,7 +566,8 @@ static int blkdev_release(struct inode *inode, struct file *filp) */ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); + struct file *file = iocb->ki_filp; + struct block_device *bdev = I_BDEV(file->f_mapping->host); struct inode *bd_inode = bdev->bd_inode; loff_t size = bdev_nr_bytes(bdev); size_t shorted = 0; @@ -569,7 +594,23 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) iov_iter_truncate(from, size); } - ret = __generic_file_write_iter(iocb, from); + ret = file_remove_privs(file); + if (ret) + return ret; + + ret = file_update_time(file); + if (ret) + return ret; + + if (iocb->ki_flags & IOCB_DIRECT) { + ret = blkdev_direct_write(iocb, from); + if (ret >= 0 && iov_iter_count(from)) + ret = direct_write_fallback(iocb, from, ret, + generic_perform_write(iocb, from)); + } else { + ret = generic_perform_write(iocb, from); + } + if (ret > 0) ret = generic_write_sync(iocb, ret); iov_iter_reexpand(from, iov_iter_count(from) + shorted); -- cgit v1.2.3 From a05f7bd9578b17521a9a5f3689f3934c082c6390 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Aug 2023 19:21:59 +0200 Subject: block: stop setting ->direct_IO Direct I/O on block devices now never goes through aops->direct_IO. Stop setting it and set FMODE_CAN_ODIRECT in ->open instead. Signed-off-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230801172201.1923299-5-hch@lst.de Signed-off-by: Jens Axboe --- block/fops.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/fops.c b/block/fops.c index 8a05d99166e3..f0b822c28ddf 100644 --- a/block/fops.c +++ b/block/fops.c @@ -428,7 +428,6 @@ const struct address_space_operations def_blk_aops = { .writepage = blkdev_writepage, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, - .direct_IO = blkdev_direct_IO, .migrate_folio = buffer_migrate_folio_norefs, .is_dirty_writeback = buffer_check_dirty_writeback, }; @@ -505,7 +504,7 @@ static int blkdev_open(struct inode *inode, struct file *filp) * during an unstable branch. */ filp->f_flags |= O_LARGEFILE; - filp->f_mode |= FMODE_BUF_RASYNC; + filp->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; /* * Use the file private data to store the holder for exclusive opens. -- cgit v1.2.3 From 487c607df790d366e67a7d6a30adf785cdd98e55 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Aug 2023 19:22:00 +0200 Subject: block: use iomap for writes to block devices Use iomap in buffer_head compat mode to write to block devices. Signed-off-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Reviewed-by: Pankaj Raghav Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20230801172201.1923299-6-hch@lst.de Signed-off-by: Jens Axboe --- block/Kconfig | 1 + block/fops.c | 31 +++++++++++++++++++++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index 86122e459fe0..1a13ef0b1ca1 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -5,6 +5,7 @@ menuconfig BLOCK bool "Enable the block layer" if EXPERT default y + select FS_IOMAP select SBITMAP help Provide block layer support for the kernel.
diff --git a/block/fops.c b/block/fops.c index f0b822c28ddf..063ece37d44e 100644 --- a/block/fops.c +++ b/block/fops.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "blk.h" @@ -386,6 +387,27 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); } +static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, struct iomap *srcmap) +{ + struct block_device *bdev = I_BDEV(inode); + loff_t isize = i_size_read(inode); + + iomap->bdev = bdev; + iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); + if (iomap->offset >= isize) + return -EIO; + iomap->type = IOMAP_MAPPED; + iomap->addr = iomap->offset; + iomap->length = isize - iomap->offset; + iomap->flags |= IOMAP_F_BUFFER_HEAD; + return 0; +} + +static const struct iomap_ops blkdev_iomap_ops = { + .iomap_begin = blkdev_iomap_begin, +}; + static int blkdev_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, blkdev_get_block, wbc); @@ -556,6 +578,11 @@ blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from) return written; } +static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from) +{ + return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops); +} + /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device. @@ -605,9 +632,9 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = blkdev_direct_write(iocb, from); if (ret >= 0 && iov_iter_count(from)) ret = direct_write_fallback(iocb, from, ret, - generic_perform_write(iocb, from)); + blkdev_buffered_write(iocb, from)); } else { - ret = generic_perform_write(iocb, from); + ret = blkdev_buffered_write(iocb, from); } if (ret > 0) -- cgit v1.2.3 From 925c86a19bacf8ce10eb666328fb3fa5aff7b951 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Aug 2023 19:22:01 +0200 Subject: fs: add CONFIG_BUFFER_HEAD Add a new config option that controls building the buffer_head code, and select it from all file systems and stacking drivers that need it. For the block device nodes, an alternative iomap based buffered I/O path is provided when buffer_head support is not enabled, and iomap needs a small tweak to define the IOMAP_F_BUFFER_HEAD flag to 0 to not call into the buffer_head code when it doesn't exist. Otherwise this is just Kconfig and ifdef changes.
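The iomap tweak mentioned above plausibly looks like the following hedged sketch; only the "flag becomes 0 when buffer_head support is disabled" behaviour is stated by the commit, the bit position is assumed:

/*
 * Sketch: with CONFIG_BUFFER_HEAD=n the flag is 0, so tests such as
 * "iomap->flags & IOMAP_F_BUFFER_HEAD" are compile-time false and the
 * buffer_head code is never entered.
 */
#ifdef CONFIG_BUFFER_HEAD
#define IOMAP_F_BUFFER_HEAD	(1U << 4)	/* assumed bit position */
#else
#define IOMAP_F_BUFFER_HEAD	0
#endif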
Signed-off-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230801172201.1923299-7-hch@lst.de Signed-off-by: Jens Axboe --- block/fops.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 60 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/fops.c b/block/fops.c index 063ece37d44e..eaa98a987213 100644 --- a/block/fops.c +++ b/block/fops.c @@ -24,15 +24,6 @@ static inline struct inode *bdev_file_inode(struct file *file) return file->f_mapping->host; } -static int blkdev_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) -{ - bh->b_bdev = I_BDEV(inode); - bh->b_blocknr = iblock; - set_buffer_mapped(bh); - return 0; -} - static blk_opf_t dio_bio_write_op(struct kiocb *iocb) { blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; @@ -400,7 +391,7 @@ static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->type = IOMAP_MAPPED; iomap->addr = iomap->offset; iomap->length = isize - iomap->offset; - iomap->flags |= IOMAP_F_BUFFER_HEAD; + iomap->flags |= IOMAP_F_BUFFER_HEAD; /* noop for !CONFIG_BUFFER_HEAD */ return 0; } @@ -408,6 +399,16 @@ static const struct iomap_ops blkdev_iomap_ops = { .iomap_begin = blkdev_iomap_begin, }; +#ifdef CONFIG_BUFFER_HEAD +static int blkdev_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + bh->b_bdev = I_BDEV(inode); + bh->b_blocknr = iblock; + set_buffer_mapped(bh); + return 0; +} + static int blkdev_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, blkdev_get_block, wbc); @@ -453,6 +454,55 @@ const struct address_space_operations def_blk_aops = { .migrate_folio = buffer_migrate_folio_norefs, .is_dirty_writeback = buffer_check_dirty_writeback, }; +#else /* CONFIG_BUFFER_HEAD */ +static int blkdev_read_folio(struct file *file, struct folio *folio) +{ + return iomap_read_folio(folio, &blkdev_iomap_ops); +} + +static void blkdev_readahead(struct readahead_control *rac) +{ + iomap_readahead(rac, &blkdev_iomap_ops); +} + +static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc, + struct inode *inode, loff_t offset) +{ + loff_t isize = i_size_read(inode); + + if (WARN_ON_ONCE(offset >= isize)) + return -EIO; + if (offset >= wpc->iomap.offset && + offset < wpc->iomap.offset + wpc->iomap.length) + return 0; + return blkdev_iomap_begin(inode, offset, isize - offset, + IOMAP_WRITE, &wpc->iomap, NULL); +} + +static const struct iomap_writeback_ops blkdev_writeback_ops = { + .map_blocks = blkdev_map_blocks, +}; + +static int blkdev_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct iomap_writepage_ctx wpc = { }; + + return iomap_writepages(mapping, wbc, &wpc, &blkdev_writeback_ops); +} + +const struct address_space_operations def_blk_aops = { + .dirty_folio = filemap_dirty_folio, + .release_folio = iomap_release_folio, + .invalidate_folio = iomap_invalidate_folio, + .read_folio = blkdev_read_folio, + .readahead = blkdev_readahead, + .writepages = blkdev_writepages, + .is_partially_uptodate = iomap_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, + .migrate_folio = filemap_migrate_folio, +}; +#endif /* CONFIG_BUFFER_HEAD */ /* * for a block special file file_inode(file)->i_size is zero -- cgit v1.2.3 From d47f9717e5cfd0dd8c0ba2ecfa47c38d140f1bb6 Mon Sep 17 00:00:00 2001 From: Zhiguo Niu Date: Thu, 3 Aug 2023 19:12:42 +0800 Subject: 
block/mq-deadline: use correct way to throttle write requests The original formula was inaccurate: dd->async_depth = max(1UL, 3 * q->nr_requests / 4); For write requests, when we assign a tag from sched_tags, data->shallow_depth will be passed to sbitmap_find_bit, see the following code: nr = sbitmap_find_bit_in_word(&sb->map[index], min_t (unsigned int, __map_depth(sb, index), depth), alloc_hint, wrap); The smaller of data->shallow_depth and __map_depth(sb, index) will be used as the maximum range when allocating bits. For an mmc device (one hw queue, deadline I/O scheduler): q->nr_requests = sched_tags = 128, so according to the previous calculation method, dd->async_depth = data->shallow_depth = 96, and the platform is 64bits with 8 cpus, sched_tags.bitmap_tags.sb.shift=5, sb.maps[]=32/32/32/32. Since 32 is smaller than 96, whether it is a read or a write I/O, tags can be allocated to the maximum range each time, which has no throttling effect. In addition, following the approach of the bfq/kyber I/O schedulers, the limit ratios are calculated based on sched_tags.bitmap_tags.sb.shift. This patch makes the throttling of write requests actually take effect. Fixes: 07757588e507 ("block/mq-deadline: Reserve 25% of scheduler tags for synchronous requests") Signed-off-by: Zhiguo Niu Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/1691061162-22898-1-git-send-email-zhiguo.niu@unisoc.com Signed-off-by: Jens Axboe --- block/mq-deadline.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 02a916ba62ee..f958e79277b8 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -646,8 +646,9 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; + unsigned int shift = tags->bitmap_tags.sb.shift; - dd->async_depth = max(1UL, 3 * q->nr_requests / 4); + dd->async_depth = max(1U, 3 * (1U << shift) / 4); sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth); } -- cgit v1.2.3 From 7c8998f75d2d42ddefb172239b0f689392958309 Mon Sep 17 00:00:00 2001 From: Jinyoung Choi Date: Thu, 3 Aug 2023 11:48:27 +0900 Subject: block: make bvec_try_merge_hw_page() non-static This will be used for the multi-page configuration of the integrity payload. Cc: Christoph Hellwig Cc: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Jinyoung Choi Tested-by: "Martin K. Petersen" Reviewed-by: "Martin K. Petersen" Link: https://lore.kernel.org/r/20230803024827epcms2p838d9e9131492c86a159fff25d195658f@epcms2p8 Signed-off-by: Jens Axboe --- block/bio.c | 2 +- block/blk.h | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index c92dda962449..8d1533af7c60 100644 --- a/block/bio.c +++ b/block/bio.c @@ -934,7 +934,7 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, * size limit. This is not for normal read/write bios, but for passthrough * or Zone Append operations that we can't split.
*/ -static bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, +bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, struct page *page, unsigned len, unsigned offset, bool *same_page) { diff --git a/block/blk.h b/block/blk.h index 686712e13835..9d22ec3a53bc 100644 --- a/block/blk.h +++ b/block/blk.h @@ -75,6 +75,10 @@ struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, gfp_t gfp_mask); void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs); +bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, + struct page *page, unsigned len, unsigned offset, + bool *same_page); + static inline bool biovec_phys_mergeable(struct request_queue *q, struct bio_vec *vec1, struct bio_vec *vec2) { -- cgit v1.2.3 From 80814b8e359f7207595f52702aea432a7bd61200 Mon Sep 17 00:00:00 2001 From: Jinyoung Choi Date: Thu, 3 Aug 2023 11:49:56 +0900 Subject: bio-integrity: update the payload size in bio_integrity_add_page() Previously, the bip's bi_size was set before any integrity pages were added. If a problem occurred in the process of adding pages for the bip, the resulting bi_size mismatch had to be dealt with. Now the bi_size is updated only when a page is successfully added to the bvec. The parts affected by the change are also contained in this commit. Cc: Christoph Hellwig Cc: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Jinyoung Choi Tested-by: "Martin K. Petersen" Reviewed-by: "Martin K. Petersen" Link: https://lore.kernel.org/r/20230803024956epcms2p38186a17392706650c582d38ef3dbcd32@epcms2p3 Signed-off-by: Jens Axboe --- block/bio-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 045553a164e0..6220a99977a4 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -137,6 +137,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, bvec_set_page(&bip->bip_vec[bip->bip_vcnt], page, len, offset); bip->bip_vcnt++; + bip->bip_iter.bi_size += len; return len; } @@ -244,7 +245,6 @@ bool bio_integrity_prep(struct bio *bio) } bip->bip_flags |= BIP_BLOCK_INTEGRITY; - bip->bip_iter.bi_size = len; bip_set_seed(bip, bio->bi_iter.bi_sector); if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM) -- cgit v1.2.3 From d1f04c2e23c99258049c6081c3147bae69e5bcb8 Mon Sep 17 00:00:00 2001 From: Jinyoung Choi Date: Thu, 3 Aug 2023 11:50:58 +0900 Subject: bio-integrity: cleanup adding integrity pages to bip's bvec. bio_integrity_add_page() returns the added length if successful, else 0, just as bio_add_page. Simplify the return value checking in bio_integrity_prep to not deal with a > 0 but < len case that can't happen. Cc: Christoph Hellwig Cc: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Jinyoung Choi Tested-by: "Martin K. Petersen" Reviewed-by: "Martin K.
Petersen" Link: https://lore.kernel.org/r/20230803025058epcms2p5a4d0db5da2ad967668932d463661c633@epcms2p5 Signed-off-by: Jens Axboe --- block/bio-integrity.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 6220a99977a4..c6b3bc86e1f9 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -252,27 +252,18 @@ bool bio_integrity_prep(struct bio *bio) /* Map it */ offset = offset_in_page(buf); - for (i = 0 ; i < nr_pages ; i++) { - int ret; + for (i = 0; i < nr_pages && len > 0; i++) { bytes = PAGE_SIZE - offset; - if (len <= 0) - break; - if (bytes > len) bytes = len; - ret = bio_integrity_add_page(bio, virt_to_page(buf), - bytes, offset); - - if (ret == 0) { + if (bio_integrity_add_page(bio, virt_to_page(buf), - bytes, offset) < bytes) { printk(KERN_ERR "could not attach integrity payload\n"); goto err_end_io; } - if (ret < bytes) - break; - buf += bytes; len -= bytes; offset = 0; @@ -291,7 +282,6 @@ err_end_io: bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); return false; - } EXPORT_SYMBOL(bio_integrity_prep); -- cgit v1.2.3 From 0ece1d649b6dd615925a72bc1824d6b9fa5b998a Mon Sep 17 00:00:00 2001 From: Jinyoung Choi Date: Thu, 3 Aug 2023 11:52:02 +0900 Subject: bio-integrity: create multi-page bvecs in bio_integrity_add_page() In general, a single bvec can describe a range of physically contiguous pages. But in the bvec configuration for the bip, each physically contiguous integrity page was put into its own bvec. Allow bio_integrity_add_page() to create multi-page bvecs, just like the bio payloads. This simplifies adding larger payloads, and fixes support for non-tiny workloads with nvme, which stopped using scatterlist for metadata a while ago. Cc: Christoph Hellwig Cc: Martin K. Petersen Fixes: 783b94bd9250 ("nvme-pci: do not build a scatterlist to map metadata") Reviewed-by: Christoph Hellwig Signed-off-by: Jinyoung Choi Tested-by: "Martin K. Petersen" Reviewed-by: "Martin K. Petersen" Link: https://lore.kernel.org/r/20230803025202epcms2p82f57cbfe32195da38c776377b55aed59@epcms2p8 Signed-off-by: Jens Axboe --- block/bio-integrity.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index c6b3bc86e1f9..ec8ac8cf6e1b 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -123,17 +123,34 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { + struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct bio_integrity_payload *bip = bio_integrity(bio); - if (bip->bip_vcnt >= bip->bip_max_vcnt) { - printk(KERN_ERR "%s: bip_vec full\n", __func__); + if (((bip->bip_iter.bi_size + len) >> SECTOR_SHIFT) > + queue_max_hw_sectors(q)) return 0; - } - if (bip->bip_vcnt && - bvec_gap_to_prev(&bdev_get_queue(bio->bi_bdev)->limits, - &bip->bip_vec[bip->bip_vcnt - 1], offset)) - return 0; + if (bip->bip_vcnt > 0) { + struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1]; + bool same_page = false; + + if (bvec_try_merge_hw_page(q, bv, page, len, offset, + &same_page)) { + bip->bip_iter.bi_size += len; + return len; + } + + if (bip->bip_vcnt >= + min(bip->bip_max_vcnt, queue_max_integrity_segments(q))) + return 0; + + /* + * If the queue doesn't support SG gaps and adding this segment + * would create a gap, disallow it.
+ */ + if (bvec_gap_to_prev(&q->limits, bv, offset)) + return 0; + } bvec_set_page(&bip->bip_vec[bip->bip_vcnt], page, len, offset); bip->bip_vcnt++; -- cgit v1.2.3 From 4eb44d10766ac0fae5973998fd2a0103df1d3fe1 Mon Sep 17 00:00:00 2001 From: Li Lingfeng Date: Thu, 10 Aug 2023 11:51:11 +0800 Subject: block: remove init_mutex and open-code blk_iolatency_try_init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit a13696b83da4 ("blk-iolatency: Make initialization lazy") adds a mutex named "init_mutex" in blk_iolatency_try_init to guard against the race condition when initializing RQ_QOS_LATENCY. Now a new lock has been added to struct request_queue by commit a13bd91be223 ("block/rq_qos: protect rq_qos apis with a new lock"), and it is already held in blkg_conf_open_bdev before calling blk_iolatency_init. So it's not necessary to keep init_mutex in blk_iolatency_try_init; just remove it. Since init_mutex has been removed, blk_iolatency_try_init can be open-coded back to iolatency_set_limit() like ioc_qos_write(). Signed-off-by: Li Lingfeng Reviewed-by: Michal Koutný Link: https://lore.kernel.org/r/20230810035111.2236335-1-lilingfeng@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-iolatency.c | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) (limited to 'block') diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index fd5fec989e39..c16aef4be036 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -824,29 +824,6 @@ static void iolatency_clear_scaling(struct blkcg_gq *blkg)