From e73a625bc24880f1fe5abaa89bb63e0918fbd66c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 Sep 2022 09:19:59 -0600 Subject: block: kill deprecated BUG_ON() in the flush handling We've never had any useful reports from this BUG_ON(), and in fact a number of the BUG_ON()'s in the flush handling need to be turned into more graceful handling. In preparation for allowing batched completions of the end_io handling, where we can enter the flush completion with queuelist having been reused for the batch, get rid of this BUG_ON(). Signed-off-by: Jens Axboe --- block/blk-flush.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index d20a0c6b2c66..27705fc584a0 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -205,7 +205,6 @@ static void blk_flush_complete_seq(struct request *rq, * flush data request completion path. Restore @rq for * normal completion and end it. */ - BUG_ON(!list_empty(&rq->queuelist)); list_del_init(&rq->flush.list); blk_flush_restore_request(rq); blk_mq_end_request(rq, error); -- cgit v1.2.3 From 4b6a5d9cea911424e84107df8c4eb8317938d2cd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Sep 2022 08:22:09 -0600 Subject: block: enable batched allocation for blk_mq_alloc_request() The filesystem IO path can take advantage of allocating batches of requests, if the underlying submitter tells the block layer about it through the blk_plug. For passthrough IO, the exported API is the blk_mq_alloc_request() helper, and that one does not allow for request caching. Wire up request caching for blk_mq_alloc_request(), which is generally done without having a bio available upfront. Tested-by: Anuj Gupta Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-mq.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 71 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 83492d942348..b32f70f38c6e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -510,25 +510,87 @@ retry: alloc_time_ns); } -struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, - blk_mq_req_flags_t flags) +static struct request *blk_mq_rq_cache_fill(struct request_queue *q, + struct blk_plug *plug, + blk_opf_t opf, + blk_mq_req_flags_t flags) { struct blk_mq_alloc_data data = { .q = q, .flags = flags, .cmd_flags = opf, - .nr_tags = 1, + .nr_tags = plug->nr_ios, + .cached_rq = &plug->cached_rq, }; struct request *rq; - int ret; - ret = blk_queue_enter(q, flags); - if (ret) - return ERR_PTR(ret); + if (blk_queue_enter(q, flags)) + return NULL; + + plug->nr_ios = 1; rq = __blk_mq_alloc_requests(&data); - if (!rq) - goto out_queue_exit; + if (unlikely(!rq)) + blk_queue_exit(q); + return rq; +} + +static struct request *blk_mq_alloc_cached_request(struct request_queue *q, + blk_opf_t opf, + blk_mq_req_flags_t flags) +{ + struct blk_plug *plug = current->plug; + struct request *rq; + + if (!plug) + return NULL; + if (rq_list_empty(plug->cached_rq)) { + if (plug->nr_ios == 1) + return NULL; + rq = blk_mq_rq_cache_fill(q, plug, opf, flags); + if (rq) + goto got_it; + return NULL; + } + rq = rq_list_peek(&plug->cached_rq); + if (!rq || rq->q != q) + return NULL; + + if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type) + return NULL; + if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) + return NULL; + + plug->cached_rq = rq_list_next(rq); +got_it: + rq->cmd_flags = opf; + INIT_LIST_HEAD(&rq->queuelist); + return rq; +} + +struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, + blk_mq_req_flags_t flags) +{ + struct request *rq; + + rq = blk_mq_alloc_cached_request(q, opf, flags); + if (!rq) { + struct blk_mq_alloc_data data = { + .q = q, + .flags = flags, + .cmd_flags = opf, + .nr_tags = 1, + }; + int ret; + + ret = blk_queue_enter(q, flags); + if (ret) + return ERR_PTR(ret); + + rq = __blk_mq_alloc_requests(&data); + if (!rq) + goto out_queue_exit; + } rq->__data_len = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; -- cgit v1.2.3 From de671d6116b5210097cd6fbb877bac92536f265b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Sep 2022 15:19:54 -0600 Subject: block: change request end_io handler to pass back a return value Everything is just converted to returning RQ_END_IO_NONE, and there should be no functional changes with this patch. In preparation for allowing the end_io handler to pass ownership back to the block layer, rather than retain ownership of the request. Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-flush.c | 10 +++++++--- block/blk-mq.c | 14 +++++++++----- 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 27705fc584a0..53202eff545e 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -217,7 +217,8 @@ static void blk_flush_complete_seq(struct request *rq, blk_kick_flush(q, fq, cmd_flags); } -static void flush_end_io(struct request *flush_rq, blk_status_t error) +static enum rq_end_io_ret flush_end_io(struct request *flush_rq, + blk_status_t error) { struct request_queue *q = flush_rq->q; struct list_head *running; @@ -231,7 +232,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) if (!req_ref_put_and_test(flush_rq)) { fq->rq_status = error; spin_unlock_irqrestore(&fq->mq_flush_lock, flags); - return; + return RQ_END_IO_NONE; } blk_account_io_flush(flush_rq); @@ -268,6 +269,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) } spin_unlock_irqrestore(&fq->mq_flush_lock, flags); + return RQ_END_IO_NONE; } bool is_flush_rq(struct request *rq) @@ -353,7 +355,8 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, blk_flush_queue_rq(flush_rq, false); } -static void mq_flush_data_end_io(struct request *rq, blk_status_t error) +static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, + blk_status_t error) { struct request_queue *q = rq->q; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; @@ -375,6 +378,7 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error) spin_unlock_irqrestore(&fq->mq_flush_lock, flags); blk_mq_sched_restart(hctx); + return RQ_END_IO_NONE; } /** diff --git a/block/blk-mq.c b/block/blk-mq.c index b32f70f38c6e..a21631de45b3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1001,7 +1001,8 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error) if (rq->end_io) { rq_qos_done(rq->q, rq); - rq->end_io(rq, error); + if (rq->end_io(rq, error) == RQ_END_IO_FREE) + blk_mq_free_request(rq); } else { blk_mq_free_request(rq); } @@ -1295,12 +1296,13 @@ struct blk_rq_wait { blk_status_t ret; }; -static void blk_end_sync_rq(struct request *rq, blk_status_t ret) +static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret) { struct blk_rq_wait *wait = rq->end_io_data; wait->ret = ret; complete(&wait->done); + return RQ_END_IO_NONE; } bool blk_rq_is_poll(struct request *rq) @@ -1534,10 +1536,12 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next) void blk_mq_put_rq_ref(struct request *rq) { - if (is_flush_rq(rq)) - rq->end_io(rq, 0); - else if (req_ref_put_and_test(rq)) + if (is_flush_rq(rq)) { + if (rq->end_io(rq, 0) == RQ_END_IO_FREE) + blk_mq_free_request(rq); + } else if (req_ref_put_and_test(rq)) { __blk_mq_free_request(rq); + } } static bool blk_mq_check_expired(struct request *rq, void *priv) -- cgit v1.2.3 From ab3e1d3bbab9e973aeb4dd4603251578658a47ff Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Sep 2022 08:24:16 -0600 Subject: block: allow end_io based requests in the completion batch handling With end_io handlers now being able to potentially pass ownership of the request upon completion, we can allow requests with end_io handlers in the batch completion handling. Reviewed-by: Anuj Gupta Reviewed-by: Keith Busch Co-developed-by: Stefan Roesch Signed-off-by: Jens Axboe --- block/blk-mq.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index a21631de45b3..8070b6c10e8d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -823,8 +823,10 @@ static void blk_complete_request(struct request *req) * can find how many bytes remain in the request * later. */ - req->bio = NULL; - req->__data_len = 0; + if (!req->end_io) { + req->bio = NULL; + req->__data_len = 0; + } } /** @@ -1055,6 +1057,13 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob) rq_qos_done(rq->q, rq); + /* + * If end_io handler returns NONE, then it still has + * ownership of the request. + */ + if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE) + continue; + WRITE_ONCE(rq->state, MQ_RQ_IDLE); if (!req_ref_put_and_test(rq)) continue; -- cgit v1.2.3 From 557654025df5706785d395558244890dc4b2c875 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Fri, 30 Sep 2022 11:57:40 +0530 Subject: block: add blk_rq_map_user_io Create a helper blk_rq_map_user_io for mapping of vectored as well as non-vectored requests. This will help in saving dupilcation of code at few places in scsi and nvme. Signed-off-by: Anuj Gupta Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220930062749.152261-4-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/blk-map.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index 7693f8e3c454..0e37bbedd46c 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -611,6 +611,42 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL(blk_rq_map_user); +int blk_rq_map_user_io(struct request *req, struct rq_map_data *map_data, + void __user *ubuf, unsigned long buf_len, gfp_t gfp_mask, + bool vec, int iov_count, bool check_iter_count, int rw) +{ + int ret = 0; + + if (vec) { + struct iovec fast_iov[UIO_FASTIOV]; + struct iovec *iov = fast_iov; + struct iov_iter iter; + + ret = import_iovec(rw, ubuf, iov_count ? iov_count : buf_len, + UIO_FASTIOV, &iov, &iter); + if (ret < 0) + return ret; + + if (iov_count) { + /* SG_IO howto says that the shorter of the two wins */ + iov_iter_truncate(&iter, buf_len); + if (check_iter_count && !iov_iter_count(&iter)) { + kfree(iov); + return -EINVAL; + } + } + + ret = blk_rq_map_user_iov(req->q, req, map_data, &iter, + gfp_mask); + kfree(iov); + } else if (buf_len) { + ret = blk_rq_map_user(req->q, req, map_data, ubuf, buf_len, + gfp_mask); + } + return ret; +} +EXPORT_SYMBOL(blk_rq_map_user_io); + /** * blk_rq_unmap_user - unmap a request with user data * @bio: start of bio list -- cgit v1.2.3 From 32f1c71b15fc9cb8e964c3d0c15ca99a70cfe8a7 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Fri, 30 Sep 2022 11:57:45 +0530 Subject: block: rename bio_map_put to blk_mq_map_bio_put This patch renames existing bio_map_put function to blk_mq_map_bio_put. Signed-off-by: Anuj Gupta Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220930062749.152261-9-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/blk-map.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index 0e37bbedd46c..84b13a4158b7 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -231,7 +231,7 @@ out_bmd: return ret; } -static void bio_map_put(struct bio *bio) +static void blk_mq_map_bio_put(struct bio *bio) { if (bio->bi_opf & REQ_ALLOC_CACHE) { bio_put(bio); @@ -331,7 +331,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, out_unmap: bio_release_pages(bio, false); - bio_map_put(bio); + blk_mq_map_bio_put(bio); return ret; } @@ -672,7 +672,7 @@ int blk_rq_unmap_user(struct bio *bio) next_bio = bio; bio = bio->bi_next; - bio_map_put(next_bio); + blk_mq_map_bio_put(next_bio); } return ret; -- cgit v1.2.3 From ab89e8e7ca526ca04baaad2aa28172d336425d67 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Fri, 30 Sep 2022 11:57:46 +0530 Subject: block: factor out blk_rq_map_bio_alloc helper Move bio allocation logic from bio_map_user_iov to a new helper blk_rq_map_bio_alloc. It is named so because functionality is opposite of what is done inside blk_mq_map_bio_put. This is a prep patch. Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20220930062749.152261-10-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/blk-map.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index 84b13a4158b7..d6ea377394a9 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -241,17 +241,10 @@ static void blk_mq_map_bio_put(struct bio *bio) } } -static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, - gfp_t gfp_mask) +static struct bio *blk_rq_map_bio_alloc(struct request *rq, + unsigned int nr_vecs, gfp_t gfp_mask) { - unsigned int max_sectors = queue_max_hw_sectors(rq->q); - unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS); struct bio *bio; - int ret; - int j; - - if (!iov_iter_count(iter)) - return -EINVAL; if (rq->cmd_flags & REQ_POLLED) { blk_opf_t opf = rq->cmd_flags | REQ_ALLOC_CACHE; @@ -259,13 +252,31 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, bio = bio_alloc_bioset(NULL, nr_vecs, opf, gfp_mask, &fs_bio_set); if (!bio) - return -ENOMEM; + return NULL; } else { bio = bio_kmalloc(nr_vecs, gfp_mask); if (!bio) - return -ENOMEM; + return NULL; bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq)); } + return bio; +} + +static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, + gfp_t gfp_mask) +{ + unsigned int max_sectors = queue_max_hw_sectors(rq->q); + unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS); + struct bio *bio; + int ret; + int j; + + if (!iov_iter_count(iter)) + return -EINVAL; + + bio = blk_rq_map_bio_alloc(rq, nr_vecs, gfp_mask); + if (bio == NULL) + return -ENOMEM; while (iov_iter_count(iter)) { struct page **pages, *stack_pages[UIO_FASTIOV]; -- cgit v1.2.3 From 37987547932c89f15f9b76950040131ddb591a8b Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Fri, 30 Sep 2022 11:57:47 +0530 Subject: block: extend functionality to map bvec iterator Extend blk_rq_map_user_iov so that it can handle bvec iterator, using the new blk_rq_map_user_bvec function. It maps the pages from bvec iterator into a bio and place the bio into request. This helper will be used by nvme for uring-passthrough path when IO is done using pre-mapped buffers. Signed-off-by: Kanchan Joshi Signed-off-by: Anuj Gupta Suggested-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220930062749.152261-11-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/blk-map.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 71 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index d6ea377394a9..34735626b00f 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -548,6 +548,62 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(blk_rq_append_bio); +/* Prepare bio for passthrough IO given ITER_BVEC iter */ +static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter) +{ + struct request_queue *q = rq->q; + size_t nr_iter = iov_iter_count(iter); + size_t nr_segs = iter->nr_segs; + struct bio_vec *bvecs, *bvprvp = NULL; + struct queue_limits *lim = &q->limits; + unsigned int nsegs = 0, bytes = 0; + struct bio *bio; + size_t i; + + if (!nr_iter || (nr_iter >> SECTOR_SHIFT) > queue_max_hw_sectors(q)) + return -EINVAL; + if (nr_segs > queue_max_segments(q)) + return -EINVAL; + + /* no iovecs to alloc, as we already have a BVEC iterator */ + bio = blk_rq_map_bio_alloc(rq, 0, GFP_KERNEL); + if (bio == NULL) + return -ENOMEM; + + bio_iov_bvec_set(bio, (struct iov_iter *)iter); + blk_rq_bio_prep(rq, bio, nr_segs); + + /* loop to perform a bunch of sanity checks */ + bvecs = (struct bio_vec *)iter->bvec; + for (i = 0; i < nr_segs; i++) { + struct bio_vec *bv = &bvecs[i]; + + /* + * If the queue doesn't support SG gaps and adding this + * offset would create a gap, fallback to copy. + */ + if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv->bv_offset)) { + blk_mq_map_bio_put(bio); + return -EREMOTEIO; + } + /* check full condition */ + if (nsegs >= nr_segs || bytes > UINT_MAX - bv->bv_len) + goto put_bio; + if (bytes + bv->bv_len > nr_iter) + goto put_bio; + if (bv->bv_offset + bv->bv_len > PAGE_SIZE) + goto put_bio; + + nsegs++; + bytes += bv->bv_len; + bvprvp = bv; + } + return 0; +put_bio: + blk_mq_map_bio_put(bio); + return -EINVAL; +} + /** * blk_rq_map_user_iov - map user data to a request, for passthrough requests * @q: request queue where request should be inserted @@ -567,24 +623,35 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, const struct iov_iter *iter, gfp_t gfp_mask) { - bool copy = false; + bool copy = false, map_bvec = false; unsigned long align = q->dma_pad_mask | queue_dma_alignment(q); struct bio *bio = NULL; struct iov_iter i; int ret = -EINVAL; - if (!iter_is_iovec(iter)) - goto fail; - if (map_data) copy = true; else if (blk_queue_may_bounce(q)) copy = true; else if (iov_iter_alignment(iter) & align) copy = true; + else if (iov_iter_is_bvec(iter)) + map_bvec = true; + else if (!iter_is_iovec(iter)) + copy = true; else if (queue_virt_boundary(q)) copy = queue_virt_boundary(q) & iov_iter_gap_alignment(iter); + if (map_bvec) { + ret = blk_rq_map_user_bvec(rq, iter); + if (!ret) + return 0; + if (ret != -EREMOTEIO) + goto fail; + /* fall back to copying the data on limits mismatches */ + copy = true; + } + i = *iter; do { if (copy) -- cgit v1.2.3