| author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-06-26 11:41:38 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-06-26 11:41:38 -0700 |
| commit | cc423f6337d0a5ff1906f3b3d465d28c0d1705f6 (patch) | |
| tree | fafc40aa7dc3ecd9800239f647d4fe21ee5db6af /fs/btrfs | |
| parent | e940efa936be65866db9ce20798b13fdc6b3891a (diff) | |
| parent | 8a4a0b2a3eaf75ca8854f856ef29690c12b2f531 (diff) | |
Merge tag 'for-6.5-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
"Mainly core changes, refactoring and optimizations.
Performance is improved in some areas; overall there may be a
cumulative improvement due to refactoring that removed lookups in the
IO path or simplified IO submission tracking.
Core:
- submit IO synchronously for fast checksums (crc32c and xxhash),
remove high priority worker kthread
- read extent buffer in one go, simplify IO tracking, bio submission
and locking
- remove additional tracking of redirtied extent buffers, originally
added for zoned mode but actually not needed
- track ordered extent pointer in bio to avoid rbtree lookups during
IO
- scrub, use recovered data stripes as cache to avoid unnecessary
read
- in zoned mode, optimize logical to physical mappings of extents
- remove PageError handling, not set by VFS nor writeback
- cleanups, refactoring, better structure packing
- lots of error handling improvements
- more assertions, lockdep annotations
- print assertion failure with the exact line where it happens
- tracepoint updates
- more debugging prints
Performance:
- speedup in fsync(), better tracking of inode logged status can
avoid transaction commit
- IO path structures track logical offsets in data structures and
do not need to look them up
User visible changes:
- don't commit transaction for every created subvolume; this can
reduce time when many subvolumes are created in a batch
- print affected files when relocation fails
- trigger orphan file cleanup during START_SYNC ioctl
Notable fixes:
- fix crash when disabling quota and relocation
- fix crashes when removing roots from dirty list
- fix transaction abort during relocation when converting from newer
profiles not covered by fallback
- in zoned mode, stop reclaiming block groups if filesystem becomes
read-only
- fix rare race condition in tree mod log rewind that can miss some
btree node slots
- with fsverity enabled, drop the up-to-date page bit in case the
verification fails"
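As background for the first Core item, the series inverts the old submission default: checksums are computed inline and the bio is submitted synchronously whenever that is cheap or required, and only the remaining write paths defer checksumming to a workqueue. The snippet below is a user-space paraphrase of that decision, for illustration only; the struct, field names and main() are invented for this sketch, and the real logic is the should_async_write() change in the fs/btrfs/bio.c hunk further down.

/*
 * Stand-alone paraphrase of the new write submission policy (not kernel
 * code): submit synchronously when checksumming is fast, when the I/O is
 * synchronous, or for metadata on zoned devices; otherwise defer the
 * checksum work to a workqueue.
 */
#include <stdbool.h>
#include <stdio.h>

struct submit_ctx {
	bool csum_impl_fast;	/* accelerated crc32c/xxhash available */
	bool op_is_sync;	/* fsync-style synchronous submission */
	bool is_metadata;	/* metadata write (REQ_META equivalent) */
	bool is_zoned;		/* zoned filesystem */
};

static bool should_defer_to_workqueue(const struct submit_ctx *c)
{
	if (c->csum_impl_fast)
		return false;	/* checksum inline, submit right away */
	if (c->op_is_sync)
		return false;	/* latency-sensitive path, do not defer */
	if (c->is_metadata && c->is_zoned)
		return false;	/* zoned metadata must be submitted in order */
	return true;		/* parallelize checksumming on a workqueue */
}

int main(void)
{
	struct submit_ctx ctx = { .csum_impl_fast = true };

	printf("defer to workqueue: %d\n", should_defer_to_workqueue(&ctx));
	return 0;
}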
* tag 'for-6.5-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (194 commits)
btrfs: fix race between quota disable and relocation
btrfs: add comment to struct btrfs_fs_info::dirty_cowonly_roots
btrfs: fix race when deleting free space root from the dirty cow roots list
btrfs: fix race when deleting quota root from the dirty cow roots list
btrfs: tracepoints: also show actual number of the outstanding extents
btrfs: update i_version in update_dev_time
btrfs: make btrfs_compressed_bioset static
btrfs: add handling for RAID1C23/DUP to btrfs_reduce_alloc_profile
btrfs: scrub: remove btrfs_fs_info::scrub_wr_completion_workers
btrfs: scrub: remove scrub_ctx::csum_list member
btrfs: do not BUG_ON after failure to migrate space during truncation
btrfs: do not BUG_ON on failure to get dir index for new snapshot
btrfs: send: do not BUG_ON() on unexpected symlink data extent
btrfs: do not BUG_ON() when dropping inode items from log root
btrfs: replace BUG_ON() at split_item() with proper error handling
btrfs: do not BUG_ON() on tree mod log failures at btrfs_del_ptr()
btrfs: do not BUG_ON() on tree mod log failures at insert_ptr()
btrfs: do not BUG_ON() on tree mod log failure at insert_new_root()
btrfs: do not BUG_ON() on tree mod log failures at push_nodes_for_insert()
btrfs: abort transaction at update_ref_for_cow() when ref count is zero
...
Diffstat (limited to 'fs/btrfs')
73 files changed, 2702 insertions, 2662 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index aac240430efe..ce083e99ef68 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -71,6 +71,16 @@ bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq)
 	return atomic_read(&wq->pending) > wq->thresh * 2;
 }
 
+static void btrfs_init_workqueue(struct btrfs_workqueue *wq,
+				 struct btrfs_fs_info *fs_info)
+{
+	wq->fs_info = fs_info;
+	atomic_set(&wq->pending, 0);
+	INIT_LIST_HEAD(&wq->ordered_list);
+	spin_lock_init(&wq->list_lock);
+	spin_lock_init(&wq->thres_lock);
+}
+
 struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
 					      const char *name, unsigned int flags,
 					      int limit_active, int thresh)
@@ -80,9 +90,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
 	if (!ret)
 		return NULL;
 
-	ret->fs_info = fs_info;
+	btrfs_init_workqueue(ret, fs_info);
+
 	ret->limit_active = limit_active;
-	atomic_set(&ret->pending, 0);
 	if (thresh == 0)
 		thresh = DFT_THRESHOLD;
 	/* For low threshold, disabling threshold is a better choice */
@@ -106,9 +116,33 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
 		return NULL;
 	}
 
-	INIT_LIST_HEAD(&ret->ordered_list);
-	spin_lock_init(&ret->list_lock);
-	spin_lock_init(&ret->thres_lock);
+	trace_btrfs_workqueue_alloc(ret, name);
+	return ret;
+}
+
+struct btrfs_workqueue *btrfs_alloc_ordered_workqueue(
+				struct btrfs_fs_info *fs_info, const char *name,
+				unsigned int flags)
+{
+	struct btrfs_workqueue *ret;
+
+	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	btrfs_init_workqueue(ret, fs_info);
+
+	/* Ordered workqueues don't allow @max_active adjustments. */
+	ret->limit_active = 1;
+	ret->current_active = 1;
+	ret->thresh = NO_THRESHOLD;
+
+	ret->normal_wq = alloc_ordered_workqueue("btrfs-%s", flags, name);
+	if (!ret->normal_wq) {
+		kfree(ret);
+		return NULL;
+	}
+
 	trace_btrfs_workqueue_alloc(ret, name);
 	return ret;
 }
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 6e2596ddae10..30f66c5e2e6e 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -31,6 +31,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
 					      unsigned int flags,
 					      int limit_active,
 					      int thresh);
+struct btrfs_workqueue *btrfs_alloc_ordered_workqueue(
+				struct btrfs_fs_info *fs_info, const char *name,
+				unsigned int flags);
 void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
 		     btrfs_func_t ordered_func, btrfs_func_t ordered_free);
 void btrfs_queue_work(struct btrfs_workqueue *wq,
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index b3ad0f51e616..12b12443efaa 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -27,6 +27,17 @@ struct btrfs_failed_bio {
 	atomic_t repair_count;
 };
 
+/* Is this a data path I/O that needs storage layer checksum and repair? */
+static inline bool is_data_bbio(struct btrfs_bio *bbio)
+{
+	return bbio->inode && is_data_inode(&bbio->inode->vfs_inode);
+}
+
+static bool bbio_has_ordered_extent(struct btrfs_bio *bbio)
+{
+	return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE;
+}
+
 /*
  * Initialize a btrfs_bio structure. This skips the embedded bio itself as it
  * is already initialized by the block layer.
@@ -61,20 +72,6 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
 	return bbio;
 }
 
-static blk_status_t btrfs_bio_extract_ordered_extent(struct btrfs_bio *bbio)
-{
-	struct btrfs_ordered_extent *ordered;
-	int ret;
-
-	ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset);
-	if (WARN_ON_ONCE(!ordered))
-		return BLK_STS_IOERR;
-	ret = btrfs_extract_ordered_extent(bbio, ordered);
-	btrfs_put_ordered_extent(ordered);
-
-	return errno_to_blk_status(ret);
-}
-
 static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
 					 struct btrfs_bio *orig_bbio,
 					 u64 map_length, bool use_append)
@@ -95,13 +92,41 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
 	btrfs_bio_init(bbio, fs_info, NULL, orig_bbio);
 	bbio->inode = orig_bbio->inode;
 	bbio->file_offset = orig_bbio->file_offset;
-	if (!(orig_bbio->bio.bi_opf & REQ_BTRFS_ONE_ORDERED))
-		orig_bbio->file_offset += map_length;
-
+	orig_bbio->file_offset += map_length;
+	if (bbio_has_ordered_extent(bbio)) {
+		refcount_inc(&orig_bbio->ordered->refs);
+		bbio->ordered = orig_bbio->ordered;
+	}
 	atomic_inc(&orig_bbio->pending_ios);
 	return bbio;
 }
 
+/* Free a bio that was never submitted to the underlying device. */
+static void btrfs_cleanup_bio(struct btrfs_bio *bbio)
+{
+	if (bbio_has_ordered_extent(bbio))
+		btrfs_put_ordered_extent(bbio->ordered);
+	bio_put(&bbio->bio);
+}
+
+static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
+{
+	if (bbio_has_ordered_extent(bbio)) {
+		struct btrfs_ordered_extent *ordered = bbio->ordered;
+
+		bbio->end_io(bbio);
+		btrfs_put_ordered_extent(ordered);
+	} else {
+		bbio->end_io(bbio);
+	}
+}
+
+void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
+{
+	bbio->bio.bi_status = status;
+	__btrfs_bio_end_io(bbio);
+}
+
 static void btrfs_orig_write_end_io(struct bio *bio);
 
 static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
@@ -130,12 +155,12 @@ static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio)
 
 		if (bbio->bio.bi_status)
 			btrfs_bbio_propagate_error(bbio, orig_bbio);
-		bio_put(&bbio->bio);
+		btrfs_cleanup_bio(bbio);
 		bbio = orig_bbio;
 	}
 
 	if (atomic_dec_and_test(&bbio->pending_ios))
-		bbio->end_io(bbio);
+		__btrfs_bio_end_io(bbio);
 }
 
 static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
@@ -327,7 +352,7 @@ static void btrfs_end_bio_work(struct work_struct *work)
 	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
 
 	/* Metadata reads are checked and repaired by the submitter. */
-	if (bbio->inode && !(bbio->bio.bi_opf & REQ_META))
+	if (is_data_bbio(bbio))
 		btrfs_check_read_bio(bbio, bbio->bio.bi_private);
 	else
 		btrfs_orig_bbio_end_io(bbio);
@@ -348,7 +373,7 @@ static void btrfs_simple_end_io(struct bio *bio)
 		INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
 		queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
 	} else {
-		if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+		if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
 			btrfs_record_physical_zoned(bbio);
 		btrfs_orig_bbio_end_io(bbio);
 	}
@@ -361,8 +386,7 @@ static void btrfs_raid56_end_io(struct bio *bio)
 	btrfs_bio_counter_dec(bioc->fs_info);
 	bbio->mirror_num = bioc->mirror_num;
 
-	if (bio_op(bio) == REQ_OP_READ && bbio->inode &&
-	    !(bbio->bio.bi_opf & REQ_META))
+	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio))
 		btrfs_check_read_bio(bbio, NULL);
 	else
 		btrfs_orig_bbio_end_io(bbio);
@@ -472,13 +496,12 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
 static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
 			       struct btrfs_io_stripe *smap, int mirror_num)
 {
-	/* Do not leak our private flag into the block layer. */
-	bio->bi_opf &= ~REQ_BTRFS_ONE_ORDERED;
-
 	if (!bioc) {
 		/* Single mirror read/write fast path. */
 		btrfs_bio(bio)->mirror_num = mirror_num;
 		bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
+		if (bio_op(bio) != REQ_OP_READ)
+			btrfs_bio(bio)->orig_physical = smap->physical;
 		bio->bi_private = smap->dev;
 		bio->bi_end_io = btrfs_simple_end_io;
 		btrfs_submit_dev_bio(smap->dev, bio);
@@ -574,27 +597,20 @@ static void run_one_async_free(struct btrfs_work *work)
 
 static bool should_async_write(struct btrfs_bio *bbio)
 {
-	/*
-	 * If the I/O is not issued by fsync and friends, (->sync_writers != 0),
-	 * then try to defer the submission to a workqueue to parallelize the
-	 * checksum calculation.
-	 */
-	if (atomic_read(&bbio->inode->sync_writers))
+	/* Submit synchronously if the checksum implementation is fast. */
+	if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
 		return false;
 
 	/*
-	 * Submit metadata writes synchronously if the checksum implementation
-	 * is fast, or we are on a zoned device that wants I/O to be submitted
-	 * in order.
+	 * Try to defer the submission to a workqueue to parallelize the
+	 * checksum calculation unless the I/O is issued synchronously.
 	 */
-	if (bbio->bio.bi_opf & REQ_META) {
-		struct btrfs_fs_info *fs_info = bbio->fs_info;
+	if (op_is_sync(bbio->bio.bi_opf))
+		return false;
 
-		if (btrfs_is_zoned(fs_info))
-			return false;
-		if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
-			return false;
-	}
+	/* Zoned devices require I/O to be submitted in order. */
+	if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(bbio->fs_info))
+		return false;
 
 	return true;
 }
@@ -622,10 +638,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
 
 	btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
 			run_one_async_free);
-	if (op_is_sync(bbio->bio.bi_opf))
-		btrfs_queue_work(fs_info->hipri_workers, &async->work);
-	else
-		btrfs_queue_work(fs_info->workers, &async->work);
+	btrfs_queue_work(fs_info->workers, &async->work);
 	return true;
 }
 
@@ -635,7 +648,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 	struct btrfs_fs_info *fs_info = bbio->fs_info;
 	struct btrfs_bio *orig_bbio = bbio;
 	struct bio *bio = &bbio->bio;
-	u64 logical = bio->bi_iter.bi_sector << 9;
+	u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
 	u64 length = bio->bi_iter.bi_size;
 	u64 map_length = length;
 	bool use_append = btrfs_use_zone_append(bbio);
@@ -645,8 +658,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 	int error;
 
 	btrfs_bio_counter_inc_blocked(fs_info);
-	error = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
-				  &bioc, &smap, &mirror_num, 1);
+	error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
+				&bioc, &smap, &mirror_num, 1);
 	if (error) {
 		ret = errno_to_blk_status(error);
 		goto fail;
@@ -665,7 +678,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 	 * Save the iter for the end_io handler and preload the checksums for
 	 * data reads.
 	 */
-	if (bio_op(bio) == REQ_OP_READ && inode && !(bio->bi_opf & REQ_META)) {
+	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) {
 		bbio->saved_iter = bio->bi_iter;
 		ret = btrfs_lookup_bio_sums(bbio);
 		if (ret)
@@ -676,9 +689,6 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 		if (use_append) {
 			bio->bi_opf &= ~REQ_OP_WRITE;
 			bio->bi_opf |= REQ_OP_ZONE_APPEND;
-			ret = btrfs_bio_extract_ordered_extent(bbio);
-			if (ret)
-				goto fail_put_bio;
 		}
 
 		/*
@@ -695,6 +705,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 			ret = btrfs_bio_csum(bbio);
 			if (ret)
 				goto fail_put_bio;
+		} else if (use_append) {
+			ret = btrfs_alloc_dummy_sum(bbio);
+			if (ret)
+				goto fail_put_bio;
 		}
 	}
 
@@ -704,7 +718,7 @@ done:
 
 fail_put_bio:
 	if (map_length < length)
-		bio_put(bio);
+		btrfs_cleanup_bio(bbio);
 fail:
 	btrfs_bio_counter_dec(fs_info);
 	btrfs_bio_end_io(orig_bbio, ret);
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index a8eca3a65673..ca79decee060 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -39,8 +39,8 @@ struct btrfs_bio {
 
 	union {
 		/*
-		 * Data checksumming and original I/O information for internal
-		 * use in the btrfs_submit_bio machinery.
+		 * For data reads: checksumming and original I/O information.
+		 * (for internal use in the btrfs_submit_bio machinery only)
 		 */
 		struct {
 			u8 *csum;
@@ -48,7 +48,20 @@ struct btrfs_bio {
 			struct bvec_iter saved_iter;
 		};
 
-		/* For metadata parentness verification. */
+		/*
+		 * For data writes:
+		 * - ordered extent covering the bio
+		 * - pointer to the checksums for this bio
+		 * - original physical address from the allocator
+		 *   (for zone append only)
+		 */
+		struct {
+			struct btrfs_ordered_extent *ordered;
+			struct btrfs_ordered_sum *sums;
+			u64 orig_physical;
+		};
+
+		/* For metadata reads: parentness verification. */
 		struct btrfs_tree_parent_check parent_check;
 	};
 
@@ -84,15 +97,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
 struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
 				  struct btrfs_fs_info *fs_info,
 				  btrfs_bio_end_io_t end_io, void *private);
-
-static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
-{
-	bbio->bio.bi_status = status;
-	bbio->end_io(bbio);
-}
-
-/* Bio only refers to one ordered extent. */
-#define REQ_BTRFS_ONE_ORDERED			REQ_DRV
+void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status);
 
 /* Submit using blkcg_punt_bio_submit. */
 #define REQ_BTRFS_CGROUP_PUNT			REQ_FS_PRIVATE
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index e97af2e510c3..48ae509f2ac2 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -95,14 +95,21 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
 	}
 	allowed &= flags;
 
-	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
+	/* Select the highest-redundancy RAID level. */
+	if (allowed & BTRFS_BLOCK_GROUP_RAID1C4)
+		allowed = BTRFS_BLOCK_GROUP_RAID1C4;
+	else if (allowed & BTRFS_BLOCK_GROUP_RAID6)
 		allowed = BTRFS_BLOCK_GROUP_RAID6;
+	else if (allowed & BTRFS_BLOCK_GROUP_RAID1C3)
+		allowed = BTRFS_BLOCK_GROUP_RAID1C3;
 	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
 		allowed = BTRFS_BLOCK_GROUP_RAID5;
 	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
 		allowed = BTRFS_BLOCK_GROUP_RAID10;
 	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
 		allowed = BTRFS_BLOCK_GROUP_RAID1;
+	else if (allowed & BTRFS_BLOCK_GROUP_DUP)
+		allowed = BTRFS_BLOCK_GROUP_DUP;
 	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
 		allowed = BTRFS_BLOCK_GROUP_RAID0;
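For reference, a hypothetical caller of the btrfs_alloc_ordered_workqueue() helper added in the async-thread.c/h hunks above could look like the sketch below. The function and workqueue names and the WQ_FREEZABLE flag are illustrative only; btrfs_init_work(), btrfs_queue_work() and btrfs_destroy_workqueue() are the existing helpers declared in async-thread.h.

/*
 * Illustrative (non-upstream) caller of the new helper: an ordered
 * workqueue runs one work item at a time, so the limit_active/thresh
 * tuning of btrfs_alloc_workqueue() does not apply here.
 */
static int example_setup_ordered_wq(struct btrfs_fs_info *fs_info)
{
	struct btrfs_workqueue *wq;

	wq = btrfs_alloc_ordered_workqueue(fs_info, "example", WQ_FREEZABLE);
	if (!wq)
		return -ENOMEM;

	/* ... btrfs_init_work() + btrfs_queue_work() as usual ... */

	btrfs_destroy_workqueue(wq);
	return 0;
}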
