75 files changed, 2737 insertions, 2667 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index aac240430efe..ce083e99ef68 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -71,6 +71,16 @@ bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq)
 	return atomic_read(&wq->pending) > wq->thresh * 2;
 }
 
+static void btrfs_init_workqueue(struct btrfs_workqueue *wq,
+				 struct btrfs_fs_info *fs_info)
+{
+	wq->fs_info = fs_info;
+	atomic_set(&wq->pending, 0);
+	INIT_LIST_HEAD(&wq->ordered_list);
+	spin_lock_init(&wq->list_lock);
+	spin_lock_init(&wq->thres_lock);
+}
+
 struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
 					      const char *name, unsigned int flags,
 					      int limit_active, int thresh)
@@ -80,9 +90,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
 	if (!ret)
 		return NULL;
 
-	ret->fs_info = fs_info;
+	btrfs_init_workqueue(ret, fs_info);
+
 	ret->limit_active = limit_active;
-	atomic_set(&ret->pending, 0);
 	if (thresh == 0)
 		thresh = DFT_THRESHOLD;
 	/* For low threshold, disabling threshold is a better choice */
@@ -106,9 +116,33 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
 		return NULL;
 	}
 
-	INIT_LIST_HEAD(&ret->ordered_list);
-	spin_lock_init(&ret->list_lock);
-	spin_lock_init(&ret->thres_lock);
+	trace_btrfs_workqueue_alloc(ret, name);
+	return ret;
+}
+
+struct btrfs_workqueue *btrfs_alloc_ordered_workqueue(
+				struct btrfs_fs_info *fs_info, const char *name,
+				unsigned int flags)
+{
+	struct btrfs_workqueue *ret;
+
+	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	btrfs_init_workqueue(ret, fs_info);
+
+	/* Ordered workqueues don't allow @max_active adjustments. */
+	ret->limit_active = 1;
+	ret->current_active = 1;
+	ret->thresh = NO_THRESHOLD;
+
+	ret->normal_wq = alloc_ordered_workqueue("btrfs-%s", flags, name);
+	if (!ret->normal_wq) {
+		kfree(ret);
+		return NULL;
+	}
+
 	trace_btrfs_workqueue_alloc(ret, name);
 	return ret;
 }
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 6e2596ddae10..30f66c5e2e6e 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -31,6 +31,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
 					      unsigned int flags,
 					      int limit_active,
 					      int thresh);
+struct btrfs_workqueue *btrfs_alloc_ordered_workqueue(
+				struct btrfs_fs_info *fs_info, const char *name,
+				unsigned int flags);
 void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
 		     btrfs_func_t ordered_func, btrfs_func_t ordered_free);
 void btrfs_queue_work(struct btrfs_workqueue *wq,
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index b3ad0f51e616..12b12443efaa 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -27,6 +27,17 @@ struct btrfs_failed_bio {
 	atomic_t repair_count;
 };
 
+/* Is this a data path I/O that needs storage layer checksum and repair? */
+static inline bool is_data_bbio(struct btrfs_bio *bbio)
+{
+	return bbio->inode && is_data_inode(&bbio->inode->vfs_inode);
+}
+
+static bool bbio_has_ordered_extent(struct btrfs_bio *bbio)
+{
+	return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE;
+}
+
 /*
  * Initialize a btrfs_bio structure. This skips the embedded bio itself as it
  * is already initialized by the block layer.
@@ -61,20 +72,6 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
 	return bbio;
 }
 
-static blk_status_t btrfs_bio_extract_ordered_extent(struct btrfs_bio *bbio)
-{
-	struct btrfs_ordered_extent *ordered;
-	int ret;
-
-	ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset);
-	if (WARN_ON_ONCE(!ordered))
-		return BLK_STS_IOERR;
-	ret = btrfs_extract_ordered_extent(bbio, ordered);
-	btrfs_put_ordered_extent(ordered);
-
-	return errno_to_blk_status(ret);
-}
-
 static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
 					 struct btrfs_bio *orig_bbio,
 					 u64 map_length, bool use_append)
@@ -95,13 +92,41 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
 	btrfs_bio_init(bbio, fs_info, NULL, orig_bbio);
 	bbio->inode = orig_bbio->inode;
 	bbio->file_offset = orig_bbio->file_offset;
-	if (!(orig_bbio->bio.bi_opf & REQ_BTRFS_ONE_ORDERED))
-		orig_bbio->file_offset += map_length;
-
+	orig_bbio->file_offset += map_length;
+	if (bbio_has_ordered_extent(bbio)) {
+		refcount_inc(&orig_bbio->ordered->refs);
+		bbio->ordered = orig_bbio->ordered;
+	}
 	atomic_inc(&orig_bbio->pending_ios);
 	return bbio;
 }
 
+/* Free a bio that was never submitted to the underlying device. */
+static void btrfs_cleanup_bio(struct btrfs_bio *bbio)
+{
+	if (bbio_has_ordered_extent(bbio))
+		btrfs_put_ordered_extent(bbio->ordered);
+	bio_put(&bbio->bio);
+}
+
+static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
+{
+	if (bbio_has_ordered_extent(bbio)) {
+		struct btrfs_ordered_extent *ordered = bbio->ordered;
+
+		bbio->end_io(bbio);
+		btrfs_put_ordered_extent(ordered);
+	} else {
+		bbio->end_io(bbio);
+	}
+}
+
+void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
+{
+	bbio->bio.bi_status = status;
+	__btrfs_bio_end_io(bbio);
+}
+
 static void btrfs_orig_write_end_io(struct bio *bio);
 
 static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
@@ -130,12 +155,12 @@ static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio)
 
 		if (bbio->bio.bi_status)
 			btrfs_bbio_propagate_error(bbio, orig_bbio);
-		bio_put(&bbio->bio);
+		btrfs_cleanup_bio(bbio);
 		bbio = orig_bbio;
 	}
 
 	if (atomic_dec_and_test(&bbio->pending_ios))
-		bbio->end_io(bbio);
+		__btrfs_bio_end_io(bbio);
 }
 
 static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
@@ -327,7 +352,7 @@ static void btrfs_end_bio_work(struct work_struct *work)
 	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
 
 	/* Metadata reads are checked and repaired by the submitter. */
-	if (bbio->inode && !(bbio->bio.bi_opf & REQ_META))
+	if (is_data_bbio(bbio))
 		btrfs_check_read_bio(bbio, bbio->bio.bi_private);
 	else
 		btrfs_orig_bbio_end_io(bbio);
@@ -348,7 +373,7 @@ static void btrfs_simple_end_io(struct bio *bio)
 		INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
 		queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
 	} else {
-		if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+		if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
 			btrfs_record_physical_zoned(bbio);
 		btrfs_orig_bbio_end_io(bbio);
 	}
@@ -361,8 +386,7 @@ static void btrfs_raid56_end_io(struct bio *bio)
 
 	btrfs_bio_counter_dec(bioc->fs_info);
 	bbio->mirror_num = bioc->mirror_num;
-	if (bio_op(bio) == REQ_OP_READ && bbio->inode &&
-	    !(bbio->bio.bi_opf & REQ_META))
+	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio))
 		btrfs_check_read_bio(bbio, NULL);
 	else
 		btrfs_orig_bbio_end_io(bbio);
@@ -472,13 +496,12 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
 static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
 			       struct btrfs_io_stripe *smap, int mirror_num)
 {
-	/* Do not leak our private flag into the block layer. */
-	bio->bi_opf &= ~REQ_BTRFS_ONE_ORDERED;
-
 	if (!bioc) {
 		/* Single mirror read/write fast path. */
 		btrfs_bio(bio)->mirror_num = mirror_num;
 		bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
+		if (bio_op(bio) != REQ_OP_READ)
+			btrfs_bio(bio)->orig_physical = smap->physical;
 		bio->bi_private = smap->dev;
 		bio->bi_end_io = btrfs_simple_end_io;
 		btrfs_submit_dev_bio(smap->dev, bio);
@@ -574,27 +597,20 @@ static void run_one_async_free(struct btrfs_work *work)
 
 static bool should_async_write(struct btrfs_bio *bbio)
 {
-	/*
-	 * If the I/O is not issued by fsync and friends, (->sync_writers != 0),
-	 * then try to defer the submission to a workqueue to parallelize the
-	 * checksum calculation.
-	 */
-	if (atomic_read(&bbio->inode->sync_writers))
+	/* Submit synchronously if the checksum implementation is fast. */
+	if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
 		return false;
 
 	/*
-	 * Submit metadata writes synchronously if the checksum implementation
-	 * is fast, or we are on a zoned device that wants I/O to be submitted
-	 * in order.
+	 * Try to defer the submission to a workqueue to parallelize the
+	 * checksum calculation unless the I/O is issued synchronously.
 	 */
-	if (bbio->bio.bi_opf & REQ_META) {
-		struct btrfs_fs_info *fs_info = bbio->fs_info;
+	if (op_is_sync(bbio->bio.bi_opf))
+		return false;
 
-		if (btrfs_is_zoned(fs_info))
-			return false;
-		if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
-			return false;
-	}
+	/* Zoned devices require I/O to be submitted in order. */
+	if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(bbio->fs_info))
+		return false;
 
 	return true;
 }
@@ -622,10 +638,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
 	btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
 			run_one_async_free);
 
-	if (op_is_sync(bbio->bio.bi_opf))
-		btrfs_queue_work(fs_info->hipri_workers, &async->work);
-	else
-		btrfs_queue_work(fs_info->workers, &async->work);
+	btrfs_queue_work(fs_info->workers, &async->work);
 	return true;
 }
 
@@ -635,7 +648,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 	struct btrfs_fs_info *fs_info = bbio->fs_info;
 	struct btrfs_bio *orig_bbio = bbio;
 	struct bio *bio = &bbio->bio;
-	u64 logical = bio->bi_iter.bi_sector << 9;
+	u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
 	u64 length = bio->bi_iter.bi_size;
 	u64 map_length = length;
 	bool use_append = btrfs_use_zone_append(bbio);
@@ -645,8 +658,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 	int error;
 
 	btrfs_bio_counter_inc_blocked(fs_info);
-	error = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
-				  &bioc, &smap, &mirror_num, 1);
+	error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
+				&bioc, &smap, &mirror_num, 1);
 	if (error) {
 		ret = errno_to_blk_status(error);
 		goto fail;
@@ -665,7 +678,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 	 * Save the iter for the end_io handler and preload the checksums for
 	 * data reads.
 	 */
-	if (bio_op(bio) == REQ_OP_READ && inode && !(bio->bi_opf & REQ_META)) {
+	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) {
 		bbio->saved_iter = bio->bi_iter;
 		ret = btrfs_lookup_bio_sums(bbio);
 		if (ret)
@@ -676,9 +689,6 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 		if (use_append) {
 			bio->bi_opf &= ~REQ_OP_WRITE;
 			bio->bi_opf |= REQ_OP_ZONE_APPEND;
-			ret = btrfs_bio_extract_ordered_extent(bbio);
-			if (ret)
-				goto fail_put_bio;
 		}
 
 		/*
@@ -695,6 +705,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 			ret = btrfs_bio_csum(bbio);
 			if (ret)
 				goto fail_put_bio;
+		} else if (use_append) {
+			ret = btrfs_alloc_dummy_sum(bbio);
+			if (ret)
+				goto fail_put_bio;
 		}
 	}
 
@@ -704,7 +718,7 @@ done:
 
 fail_put_bio:
 	if (map_length < length)
-		bio_put(bio);
+		btrfs_cleanup_bio(bbio);
 fail:
 	btrfs_bio_counter_dec(fs_info);
 	btrfs_bio_end_io(orig_bbio, ret);
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index a8eca3a65673..ca79decee060 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -39,8 +39,8 @@ struct btrfs_bio {
 
 	union {
 		/*
-		 * Data checksumming and original I/O information for internal
-		 * use in the btrfs_submit_bio machinery.
+		 * For data reads: checksumming and original I/O information.
+		 * (for internal use in the btrfs_submit_bio machinery only)
 		 */
 		struct {
 			u8 *csum;
@@ -48,7 +48,20 @@ struct btrfs_bio {
 			struct bvec_iter saved_iter;
 		};
 
-		/* For metadata parentness verification. */
+		/*
+		 * For data writes:
+		 * - ordered extent covering the bio
+		 * - pointer to the checksums for this bio
+		 * - original physical address from the allocator
+		 *   (for zone append only)
+		 */
+		struct {
+			struct btrfs_ordered_extent *ordered;
+			struct btrfs_ordered_sum *sums;
+			u64 orig_physical;
+		};
+
+		/* For metadata reads: parentness verification. */
 		struct btrfs_tree_parent_check parent_check;
 	};
 
@@ -84,15 +97,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
 struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
 				  struct btrfs_fs_info *fs_info,
 				  btrfs_bio_end_io_t end_io, void *private);
-
-static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
-{
-	bbio->bio.bi_status = status;
-	bbio->end_io(bbio);
-}
-
-/* Bio only refers to one ordered extent. */
-#define REQ_BTRFS_ONE_ORDERED REQ_DRV
+void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status);
 
 /* Submit using blkcg_punt_bio_submit. */
 #define REQ_BTRFS_CGROUP_PUNT REQ_FS_PRIVATE
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index e97af2e510c3..48ae509f2ac2 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -95,14 +95,21 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
 	}
 	allowed &= flags;
 
-	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
+	/* Select the highest-redundancy RAID level. */
+	if (allowed & BTRFS_BLOCK_GROUP_RAID1C4)
+		allowed = BTRFS_BLOCK_GROUP_RAID1C4;
+	else if (allowed & BTRFS_BLOCK_GROUP_RAID6)
 		allowed = BTRFS_BLOCK_GROUP_RAID6;
+	else if (allowed & BTRFS_BLOCK_GROUP_RAID1C3)
+		allowed = BTRFS_BLOCK_GROUP_RAID1C3;
 	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
 		allowed = BTRFS_BLOCK_GROUP_RAID5;
 	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
 		allowed = BTRFS_BLOCK_GROUP_RAID10;
 	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
 		allowed = BTRFS_BLOCK_GROUP_RAID1;
+	else if (allowed & BTRFS_BLOCK_GROUP_DUP)
+		allowed = BTRFS_BLOCK_GROUP_DUP;
 	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
 		allowed = BTRFS_BLOCK_GROUP_RAID0;
 
@@ -1633,11 +1640,14 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
 {
 	struct btrfs_fs_info *fs_info = bg->fs_info;
 
+	trace_btrfs_add_unused_block_group(bg);
 	spin_lock(&fs_info->unused_bgs_lock);
 	if (list_empty(&bg->bg_list)) {
 		btrfs_get_block_group(bg);
-		trace_btrfs_add_unused_block_group(bg);
 		list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
+	} else {
+		/* Pull out the block group from the reclaim_bgs list. */
+		list_move_tail(&bg->bg_list, &fs_info->unused_bgs);
 	}
 	spin_unlock(&fs_info->unused_bgs_lock);
 }
@@ -1791,8 +1801,15 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
 		}
 		spin_unlock(&bg->lock);
 
-		/* Get out fast, in case we're unmounting the filesystem */
-		if (btrfs_fs_closing(fs_info)) {
+		/*
+		 * Get out fast, in case we're read-only or unmounting the
+		 * filesystem. It is OK to drop block groups from the list even
+		 * for the read-only case. As we did sb_start_write(),
+		 * "mount -o remount,ro" won't happen and read-only filesystem
+		 * means it is forced read-only due to a fatal error. So, it
+		 * never gets back to read-write to let us reclaim again.
+		 */
+		if (btrfs_need_cleaner_sleep(fs_info)) {
 			up_write(&space_info->groups_sem);
 			goto next;
 		}
@@ -1823,11 +1840,27 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
 		}
 
 next:
+		if (ret)
+			btrfs_mark_bg_to_reclaim(bg);
 		btrfs_put_block_group(bg);
+
+		mutex_unlock(&fs_info->reclaim_bgs_lock);
+		/*
+		 * Reclaiming all the block groups in the list can take really
+		 * long. Prioritize cleaning up unused block groups.
+		 */
+		btrfs_delete_unused_bgs(fs_info);
+		/*
+		 * If we are interrupted by a balance, we can just bail out. The
+		 * cleaner thread restart again if necessary.
+		 */
+		if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
+			goto end;
 		spin_lock(&fs_info->unused_bgs_lock);
 	}
 	spin_unlock(&fs_info->unused_bgs_lock);
 	mutex_unlock(&fs_info->reclaim_bgs_lock);
+end:
 	btrfs_exclop_finish(fs_info);
 	sb_end_write(fs_info->sb);
 }
@@ -3521,9 +3554,9 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
 		spin_unlock(&cache->lock);
 		spin_unlock(&space_info->lock);
 
-		set_extent_dirty(&trans->transaction->pinned_extents,
-				 bytenr, bytenr + num_bytes - 1,
-				 GFP_NOFS | __GFP_NOFAIL);
+		set_extent_bit(&trans->transaction->pinned_extents,
+			       bytenr, bytenr + num_bytes - 1,
+			       EXTENT_DIRTY, NULL);
 	}
 
 	spin_lock(&trans->transaction->dirty_bgs_lock);
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index cc0e4b37db2d..f204addc3fe8 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -162,7 +162,14 @@ struct btrfs_block_group {
 	 */
 	struct list_head cluster_list;
 
-	/* For delayed block group creation or deletion of empty block groups */
+	/*
+	 * Used for several lists:
+	 *
+	 * 1) struct btrfs_fs_info::unused_bgs
+	 * 2) struct btrfs_fs_info::reclaim_bgs
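For illustration only, not part of the patch above: a minimal sketch of how a caller might use the btrfs_alloc_ordered_workqueue() helper that fs/btrfs/async-thread.h now declares. The fs_info->example_workers member and the surrounding error handling are hypothetical and only stand in for whatever ordered queue a real caller would set up at mount time.

	/*
	 * Hypothetical mount-time setup: allocate a strictly ordered btrfs
	 * workqueue (the helper fixes max_active at 1) and fail with -ENOMEM
	 * if the allocation does not succeed. "example_workers" is not a
	 * real btrfs_fs_info member.
	 */
	fs_info->example_workers = btrfs_alloc_ordered_workqueue(fs_info, "example", 0);
	if (!fs_info->example_workers)
		return -ENOMEM;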
