author     Linus Torvalds <torvalds@linux-foundation.org>  2024-01-11 13:58:04 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>  2024-01-11 13:58:04 -0800
commit     01d550f0fcc06c7292f79a6f1453aac122d1d2c8
tree       58b58ac1cb833af0469b1942774a382633bc6cda
parent     d05e626603d57936314816433db8bf1d34b5a504
parent     587371ed783b046f22ba7a5e1cc9a19ae35123b4
Merge tag 'for-6.8/block-2024-01-08' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe:
"Pretty quiet round this time around. This contains:
- NVMe updates via Keith:
- nvme fabrics spec updates (Guixin, Max)
- nvme target updates (Guixin, Evan)
- nvme attribute refactoring (Daniel)
- nvme-fc numa fix (Keith)
- MD updates via Song:
- Fix/Cleanup RCU usage from conf->disks[i].rdev (Yu Kuai)
- Fix raid5 hang issue (Junxiao Bi)
- Add Yu Kuai as Reviewer of the md subsystem
- Remove deprecated flavors (Song Liu)
- raid1 read error check support (Li Nan)
- Better handle events off-by-1 case (Alex Lyakas)
- Efficiency improvements for passthrough (Kundan)
- Support for mapping integrity data directly (Keith)
- Zoned write fix (Damien)
- rnbd fixes (Kees, Santosh, Supriti)
- Default to a sane discard size granularity (Christoph)
- Make the default max transfer size naming less confusing
(Christoph)
- Remove support for deprecated host aware zoned model (Christoph)
- Misc fixes (me, Li, Matthew, Min, Ming, Randy, liyouhong, Daniel,
Bart, Christoph)"
* tag 'for-6.8/block-2024-01-08' of git://git.kernel.dk/linux: (78 commits)
block: Treat sequential write preferred zone type as invalid
block: remove disk_clear_zoned
sd: remove the !ZBC && blk_queue_is_zoned case in sd_read_block_characteristics
drivers/block/xen-blkback/common.h: Fix spelling typo in comment
blk-cgroup: fix rcu lockdep warning in blkg_lookup()
blk-cgroup: don't use removal safe list iterators
block: floor the discard granularity to the physical block size
mtd_blkdevs: use the default discard granularity
bcache: use the default discard granularity
zram: use the default discard granularity
null_blk: use the default discard granularity
nbd: use the default discard granularity
ubd: use the default discard granularity
block: default the discard granularity to sector size
bcache: discard_granularity should not be smaller than a sector
block: remove two comments in bio_split_discard
block: rename and document BLK_DEF_MAX_SECTORS
loop: don't abuse BLK_DEF_MAX_SECTORS
aoe: don't abuse BLK_DEF_MAX_SECTORS
null_blk: don't cap max_hw_sectors to BLK_DEF_MAX_SECTORS
...
79 files changed, 1254 insertions, 2660 deletions
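
Of note in the diff that follows: the passthrough metadata work adds a new helper, bio_integrity_map_user(), which attaches a user-space buffer as a bio's integrity payload, pinning the user pages directly when alignment and the queue's integrity segment limit allow, and falling back to an internal bounce copy otherwise. The minimal sketch below shows how a passthrough-style caller might use it; only bio_integrity_map_user() itself comes from this series, while the wrapper function, its name, and its arguments are hypothetical.

	/*
	 * Illustrative sketch only: attach caller-supplied protection
	 * information to an already-built passthrough bio.
	 */
	static int attach_user_pi(struct bio *bio, void __user *meta_buf,
				  ssize_t meta_len, u32 seed)
	{
		/* Pins the user pages, or bounce-buffers them on misalignment. */
		int ret = bio_integrity_map_user(bio, meta_buf, meta_len, seed);

		if (ret)
			return ret;
		/*
		 * Nothing else to do here: bio_integrity_free() later unpins
		 * the pages and, for reads, copies bounced metadata back.
		 */
		return 0;
	}
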
diff --git a/MAINTAINERS b/MAINTAINERS
index 3a3db42fff2a..81f6b37e82c8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -20079,6 +20079,7 @@ F:	include/linux/property.h
 
 SOFTWARE RAID (Multiple Disks) SUPPORT
 M:	Song Liu <song@kernel.org>
+R:	Yu Kuai <yukuai3@huawei.com>
 L:	linux-raid@vger.kernel.org
 S:	Supported
 Q:	https://patchwork.kernel.org/project/linux-raid/list/
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 50206feac577..92ee2697ff39 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -798,7 +798,6 @@ static int ubd_open_dev(struct ubd *ubd_dev)
 		ubd_dev->cow.fd = err;
 	}
 	if (ubd_dev->no_trim == 0) {
-		ubd_dev->queue->limits.discard_granularity = SECTOR_SIZE;
 		blk_queue_max_discard_sectors(ubd_dev->queue, UBD_MAX_REQUEST);
 		blk_queue_max_write_zeroes_sectors(ubd_dev->queue, UBD_MAX_REQUEST);
 	}
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index ec8ac8cf6e1b..feef615e2c9c 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -69,15 +69,15 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
 
 	memset(bip, 0, sizeof(*bip));
 
+	/* always report as many vecs as asked explicitly, not inline vecs */
+	bip->bip_max_vcnt = nr_vecs;
 	if (nr_vecs > inline_vecs) {
-		bip->bip_max_vcnt = nr_vecs;
 		bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool,
 					  &bip->bip_max_vcnt, gfp_mask);
 		if (!bip->bip_vec)
 			goto err;
 	} else {
 		bip->bip_vec = bip->bip_inline_vecs;
-		bip->bip_max_vcnt = inline_vecs;
 	}
 
 	bip->bip_bio = bio;
@@ -91,6 +91,47 @@ err:
 }
 EXPORT_SYMBOL(bio_integrity_alloc);
 
+static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs,
+				     bool dirty)
+{
+	int i;
+
+	for (i = 0; i < nr_vecs; i++) {
+		if (dirty && !PageCompound(bv[i].bv_page))
+			set_page_dirty_lock(bv[i].bv_page);
+		unpin_user_page(bv[i].bv_page);
+	}
+}
+
+static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip)
+{
+	unsigned short nr_vecs = bip->bip_max_vcnt - 1;
+	struct bio_vec *copy = &bip->bip_vec[1];
+	size_t bytes = bip->bip_iter.bi_size;
+	struct iov_iter iter;
+	int ret;
+
+	iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes);
+	ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter);
+	WARN_ON_ONCE(ret != bytes);
+
+	bio_integrity_unpin_bvec(copy, nr_vecs, true);
+}
+
+static void bio_integrity_unmap_user(struct bio_integrity_payload *bip)
+{
+	bool dirty = bio_data_dir(bip->bip_bio) == READ;
+
+	if (bip->bip_flags & BIP_COPY_USER) {
+		if (dirty)
+			bio_integrity_uncopy_user(bip);
+		kfree(bvec_virt(bip->bip_vec));
+		return;
+	}
+
+	bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt, dirty);
+}
+
 /**
  * bio_integrity_free - Free bio integrity payload
  * @bio: bio containing bip to be freed
@@ -105,6 +146,8 @@ void bio_integrity_free(struct bio *bio)
 
 	if (bip->bip_flags & BIP_BLOCK_INTEGRITY)
 		kfree(bvec_virt(bip->bip_vec));
+	else if (bip->bip_flags & BIP_INTEGRITY_USER)
+		bio_integrity_unmap_user(bip);
 
 	__bio_integrity_free(bs, bip);
 	bio->bi_integrity = NULL;
@@ -160,6 +203,177 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 }
 EXPORT_SYMBOL(bio_integrity_add_page);
 
+static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
+				   int nr_vecs, unsigned int len,
+				   unsigned int direction, u32 seed)
+{
+	bool write = direction == ITER_SOURCE;
+	struct bio_integrity_payload *bip;
+	struct iov_iter iter;
+	void *buf;
+	int ret;
+
+	buf = kmalloc(len, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	if (write) {
+		iov_iter_bvec(&iter, direction, bvec, nr_vecs, len);
+		if (!copy_from_iter_full(buf, len, &iter)) {
+			ret = -EFAULT;
+			goto free_buf;
+		}
+
+		bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
+	} else {
+		memset(buf, 0, len);
+
+		/*
+		 * We need to preserve the original bvec and the number of vecs
+		 * in it for completion handling
+		 */
+		bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs + 1);
+	}
+
+	if (IS_ERR(bip)) {
+		ret = PTR_ERR(bip);
+		goto free_buf;
+	}
+
+	if (write)
+		bio_integrity_unpin_bvec(bvec, nr_vecs, false);
+	else
+		memcpy(&bip->bip_vec[1], bvec, nr_vecs * sizeof(*bvec));
+
+	ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
+				     offset_in_page(buf));
+	if (ret != len) {
+		ret = -ENOMEM;
+		goto free_bip;
+	}
+
+	bip->bip_flags |= BIP_INTEGRITY_USER | BIP_COPY_USER;
+	bip->bip_iter.bi_sector = seed;
+	return 0;
+free_bip:
+	bio_integrity_free(bio);
+free_buf:
+	kfree(buf);
+	return ret;
+}
+
+static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec,
+				   int nr_vecs, unsigned int len, u32 seed)
+{
+	struct bio_integrity_payload *bip;
+
+	bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs);
+	if (IS_ERR(bip))
+		return PTR_ERR(bip);
+
+	memcpy(bip->bip_vec, bvec, nr_vecs * sizeof(*bvec));
+	bip->bip_flags |= BIP_INTEGRITY_USER;
+	bip->bip_iter.bi_sector = seed;
+	bip->bip_iter.bi_size = len;
+	return 0;
+}
+
+static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages,
+				    int nr_vecs, ssize_t bytes, ssize_t offset)
+{
+	unsigned int nr_bvecs = 0;
+	int i, j;
+
+	for (i = 0; i < nr_vecs; i = j) {
+		size_t size = min_t(size_t, bytes, PAGE_SIZE - offset);
+		struct folio *folio = page_folio(pages[i]);
+
+		bytes -= size;
+		for (j = i + 1; j < nr_vecs; j++) {
+			size_t next = min_t(size_t, PAGE_SIZE, bytes);
+
+			if (page_folio(pages[j]) != folio ||
+			    pages[j] != pages[j - 1] + 1)
+				break;
+			unpin_user_page(pages[j]);
+			size += next;
+			bytes -= next;
+		}
+
+		bvec_set_page(&bvec[nr_bvecs], pages[i], size, offset);
+		offset = 0;
+		nr_bvecs++;
+	}
+
+	return nr_bvecs;
+}
+
+int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes,
+			   u32 seed)
+{
+	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+	unsigned int align = q->dma_pad_mask | queue_dma_alignment(q);
+	struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages;
+	struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec;
+	unsigned int direction, nr_bvecs;
+	struct iov_iter iter;
+	int ret, nr_vecs;
+	size_t offset;
+	bool copy;
+
+	if (bio_integrity(bio))
+		return -EINVAL;
+	if (bytes >> SECTOR_SHIFT > queue_max_hw_sectors(q))
+		return -E2BIG;
+
+	if (bio_data_dir(bio) == READ)
+		direction = ITER_DEST;
+	else
+		direction = ITER_SOURCE;
+
+	iov_iter_ubuf(&iter, direction, ubuf, bytes);
+	nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1);
+	if (nr_vecs > BIO_MAX_VECS)
+		return -E2BIG;
+	if (nr_vecs > UIO_FASTIOV) {
+		bvec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL);
+		if (!bvec)
+			return -ENOMEM;
+		pages = NULL;
+	}
+
+	copy = !iov_iter_is_aligned(&iter, align, align);
+	ret = iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset);
+	if (unlikely(ret < 0))
+		goto free_bvec;
+
+	nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset);
+	if (pages != stack_pages)
+		kvfree(pages);
+	if (nr_bvecs > queue_max_integrity_segments(q))
+		copy = true;
+
+	if (copy)
+		ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes,
+					      direction, seed);
+	else
+		ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes, seed);
+	if (ret)
+		goto release_pages;
+	if (bvec != stack_vec)
+		kfree(bvec);
+
+	return 0;
+
+release_pages:
+	bio_integrity_unpin_bvec(bvec, nr_bvecs, false);
+free_bvec:
+	if (bvec != stack_vec)
+		kfree(bvec);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(bio_integrity_map_user);
+
 /**
  * bio_integrity_process - Process integrity metadata for a bio
  * @bio: bio to generate/verify integrity metadata for
diff --git a/block/bio.c b/block/bio.c
index 816d412c06e9..b9642a41f286 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -944,7 +944,7 @@ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
 
 	if ((addr1 | mask) != (addr2 | mask))
 		return false;
-	if (bv->bv_len + len > queue_max_segment_size(q))
+	if (len > queue_max_segment_size(q) - bv->bv_len)
 		return false;
 	return bvec_try_merge_page(bv, page, len, offset, same_page);
 }
@@ -966,10 +966,13 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
 		struct page *page, unsigned int len, unsigned int offset,
 		unsigned int max_sectors, bool *same_page)
 {
+	unsigned int max_size = max_sectors << SECTOR_SHIFT;
+
 	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
 		return 0;
 
-	if (((bio->bi_iter.bi_size + len) >> SECTOR_SHIFT) > max_sectors)
+	len = min3(len, max_size, queue_max_segment_size(q));
+	if (len > max_size - bio->bi_iter.bi_size)
 		return 0;
 
 	if (bio->bi_vcnt > 0) {
@@ -1145,13 +1148,22 @@ EXPORT_SYMBOL(bio_add_folio);
 
 void __bio_release_pages(struct bio *bio, bool mark_dirty)
 {
-	struct bvec_iter_all iter_all;
-	struct bio_vec *bvec;
+	struct folio_iter fi;
 
-	bio_for_each_segment_all(bvec, bio, iter_all) {
-		if (mark_dirty && !PageCompound(bvec->bv_page))
-			set_page_dirty_lock(bvec->bv_page);
-		bio_release_page(bio, bvec->bv_page);
+	bio_for_each_folio_all(fi, bio) {
+		struct page *page;
+		size_t done = 0;
+
+		if (mark_dirty) {
+			folio_lock(fi.folio);
+			folio_mark_dirty(fi.folio);
+			folio_unlock(fi.folio);
+		}
+		page = folio_page(fi.folio, fi.offset / PAGE_SIZE);
+		do {
+			bio_release_page(bio, page++);
+			done += PAGE_SIZE;
+		} while (done < fi.length);
 	}
 }
 EXPORT_SYMBOL_GPL(__bio_release_pages);
@@ -1439,18 +1451,12 @@ EXPORT_SYMBOL(bio_free_pages);
  * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
  * for performing direct-IO in BIOs.
  *
- * The problem is that we cannot run set_page_dirty() from interrupt context
+ * The problem is that we cannot run folio_mark_dirty() from interrupt context
  * because the required locks are not interrupt-safe. So what we can do is to
  * mark the pages dirty _before_ performing IO. And in interrupt context,
  * check that the pages are still dirty. If so, fine. If not, redirty them
  * in process context.
  *
- * We special-case compound pages here: normally this means reads into hugetlb
- * pages. The logic in here doesn't really work right for compound pages
- * because the VM does not uniformly chase down the head page in all cases.
- * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
- * handle them at all. So we skip compound pages here at an early stage.
- *
  * Note that this code is very hard to test under normal circumstances because
  * direct-io pins the pages with get_user_pages(). This makes
  * is_page_cache_freeable return false, and the VM will not clean the pages.
@@ -1466,12 +1472,12 @@ EXPORT_SYMBOL(bio_free_pages);
  */
 void bio_set_pages_dirty(struct bio *bio)
 {
-	struct bio_vec *bvec;
-	struct bvec_iter_all iter_all;
+	struct folio_iter fi;
 
-	bio_for_each_segment_all(bvec, bio, iter_all) {
-		if (!PageCompound(bvec->bv_page))
-			set_page_dirty_lock(bvec->bv_page);
+	bio_for_each_folio_all(fi, bio) {
+		folio_lock(fi.folio);
+		folio_mark_dirty(fi.folio);
+		folio_unlock(fi.folio);
 	}
 }
 EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
@@ -1515,12 +1521,11 @@ static void bio_dirty_fn(struct work_struct *work)
 
 void bio_check_pages_dirty(struct bio *bio)
 {
-	struct bio_vec *bvec;
+	struct folio_iter fi;
 	unsigned long flags;
-	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bvec, bio, iter_all) {
-		if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
+	bio_for_each_folio_all(fi, bio) {
+		if (!folio_test_dirty(fi.folio))
 			goto defer;
 	}
 
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 4b48c2c44098..e303fd317313 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -575,13 +575,13 @@ static void blkg_destroy(struct blkcg_gq *blkg)
 static void blkg_destroy_all(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
-	struct blkcg_gq *blkg, *n;
+	struct blkcg_gq *blkg;
 	int count = BLKG_DESTROY_BATCH_SIZE;
 	int i;
 
 restart:
 	spin_lock_irq(&q->queue_lock);
-	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
+	list_for_each_entry(blkg, &q->blkg_list, q_node) {
 		struct blkcg *blkcg = blkg->blkcg;
 
 		if (hlist_unhashed(&blkg->blkcg_node))
@@ -2064,6 +2064,9 @@ void bio_associate_blkg(struct bio *bio)
 {
 	struct cgroup_subsys_state *css;
 
+	if (blk_op_is_passthrough(bio->bi_opf))
+		return;
+
 	rcu_read_lock();
 
 	if (bio->bi_blkg)
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index fd482439afbc..b927a4a0ad03 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -252,7 +252,8 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
 	if (blkcg == &blkcg_root)
 		return q->root_blkg;
 
-	blkg = rcu_dereference(blkcg->blkg_hint);
+	blkg = rcu_dereference_check(blkcg->blkg_hint,
+			lockdep_is_held(&q->queue_lock));
 	if (blkg && blkg->q == q)
 		return blkg;
 
diff --git a/block/blk-core.c b/block/blk-core.c
index 2eca76ccf4ee..11342af420d0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -772,6 +772,15 @@ void submit_bio_noacct(struct bio *bio)
 		bio_clear_polled(bio);
 
 	switch (bio_op(bio)) {
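
One easy-to-miss detail in the block/bio.c hunks above is the rewritten bound checks: "bv->bv_len + len > queue_max_segment_size(q)" becomes "len > queue_max_segment_size(q) - bv->bv_len", and the size check in bio_add_hw_page() is restructured the same way. Adding first can wrap around in unsigned arithmetic and wrongly accept an oversized length; subtracting from the limit cannot, since the existing length is already expected to be within it. The standalone program below (illustrative only, not kernel code; the values are made up) demonstrates the difference:

	#include <stdio.h>

	int main(void)
	{
		unsigned int max = 65536;      /* e.g. queue_max_segment_size() */
		unsigned int cur = 4096;       /* e.g. bv->bv_len */
		unsigned int len = 0xfffff000; /* absurdly large request */

		/* Old form: cur + len wraps to a small value and passes. */
		if (cur + len > max)
			printf("old check rejects len=%u\n", len);
		else
			printf("old check wrongly accepts len=%u (sum wrapped to %u)\n",
			       len, cur + len);

		/* New form: no addition, so no wrap-around; correctly rejects. */
		if (len > max - cur)
			printf("new check rejects len=%u\n", len);
		return 0;
	}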