62 files changed, 2816 insertions, 4793 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 941b2dca70db..69ccf7457ae1 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -41,6 +41,9 @@ config BLK_RQ_ALLOC_TIME
 config BLK_CGROUP_RWSTAT
 	bool
 
+config BLK_CGROUP_PUNT_BIO
+	bool
+
 config BLK_DEV_BSG_COMMON
 	tristate
 
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index bd50b55bdb61..18c922579719 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -56,7 +56,6 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */
 
 bool blkcg_debug_stats = false;
-static struct workqueue_struct *blkcg_punt_bio_wq;
 
 #define BLKG_DESTROY_BATCH_SIZE  64
 
@@ -166,7 +165,9 @@ static void __blkg_release(struct rcu_head *rcu)
 {
 	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
 
+#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
 	WARN_ON(!bio_list_empty(&blkg->async_bios));
+#endif
 
 	/* release the blkcg and parent blkg refs this blkg has been holding */
 	css_put(&blkg->blkcg->css);
@@ -188,6 +189,9 @@ static void blkg_release(struct percpu_ref *ref)
 	call_rcu(&blkg->rcu_head, __blkg_release);
 }
 
+#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
+static struct workqueue_struct *blkcg_punt_bio_wq;
+
 static void blkg_async_bio_workfn(struct work_struct *work)
 {
 	struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
@@ -198,10 +202,10 @@ static void blkg_async_bio_workfn(struct work_struct *work)
 	bool need_plug = false;
 
 	/* as long as there are pending bios, @blkg can't go away */
-	spin_lock_bh(&blkg->async_bio_lock);
+	spin_lock(&blkg->async_bio_lock);
 	bio_list_merge(&bios, &blkg->async_bios);
 	bio_list_init(&blkg->async_bios);
-	spin_unlock_bh(&blkg->async_bio_lock);
+	spin_unlock(&blkg->async_bio_lock);
 
 	/* start plug only when bio_list contains at least 2 bios */
 	if (bios.head && bios.head->bi_next) {
@@ -214,6 +218,40 @@ static void blkg_async_bio_workfn(struct work_struct *work)
 		blk_finish_plug(&plug);
 }
 
+/*
+ * When a shared kthread issues a bio for a cgroup, doing so synchronously can
+ * lead to priority inversions as the kthread can be trapped waiting for that
+ * cgroup.  Use this helper instead of submit_bio to punt the actual issuing to
+ * a dedicated per-blkcg work item to avoid such priority inversions.
+ */
+void blkcg_punt_bio_submit(struct bio *bio)
+{
+	struct blkcg_gq *blkg = bio->bi_blkg;
+
+	if (blkg->parent) {
+		spin_lock(&blkg->async_bio_lock);
+		bio_list_add(&blkg->async_bios, bio);
+		spin_unlock(&blkg->async_bio_lock);
+		queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
+	} else {
+		/* never bounce for the root cgroup */
+		submit_bio(bio);
+	}
+}
+EXPORT_SYMBOL_GPL(blkcg_punt_bio_submit);
+
+static int __init blkcg_punt_bio_init(void)
+{
+	blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
+					    WQ_MEM_RECLAIM | WQ_FREEZABLE |
+					    WQ_UNBOUND | WQ_SYSFS, 0);
+	if (!blkcg_punt_bio_wq)
+		return -ENOMEM;
+	return 0;
+}
+subsys_initcall(blkcg_punt_bio_init);
+#endif /* CONFIG_BLK_CGROUP_PUNT_BIO */
+
 /**
  * bio_blkcg_css - return the blkcg CSS associated with a bio
  * @bio: target bio
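The helper added above replaces the old REQ_CGROUP_PUNT flag handling with an explicit submission interface. As a rough caller-side sketch (hypothetical filesystem code, not part of this diff): a writeback path running in a shared kthread hands its bios to blkcg_punt_bio_submit() instead of submit_bio(), so the actual issue happens from the per-blkcg work item and the kthread cannot be trapped behind a throttled cgroup:

	/*
	 * Hypothetical caller, for illustration only: writeback done from a
	 * kthread shared by many cgroups.  Punting charges and issues the
	 * bio from the owning cgroup's context instead of the kthread's.
	 */
	static void myfs_issue_from_shared_worker(struct bio *bio)
	{
		if (bio->bi_blkg)			/* bio associated with a cgroup */
			blkcg_punt_bio_submit(bio);	/* queued to per-blkcg worker */
		else
			submit_bio(bio);		/* direct submission */
	}
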
@@ -269,10 +307,12 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
 
 	blkg->q = disk->queue;
 	INIT_LIST_HEAD(&blkg->q_node);
+	blkg->blkcg = blkcg;
+#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
 	spin_lock_init(&blkg->async_bio_lock);
 	bio_list_init(&blkg->async_bios);
 	INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
-	blkg->blkcg = blkcg;
+#endif
 
 	u64_stats_init(&blkg->iostat.sync);
 	for_each_possible_cpu(cpu) {
@@ -1688,25 +1728,6 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
 
-bool __blkcg_punt_bio_submit(struct bio *bio)
-{
-	struct blkcg_gq *blkg = bio->bi_blkg;
-
-	/* consume the flag first */
-	bio->bi_opf &= ~REQ_CGROUP_PUNT;
-
-	/* never bounce for the root cgroup */
-	if (!blkg->parent)
-		return false;
-
-	spin_lock_bh(&blkg->async_bio_lock);
-	bio_list_add(&blkg->async_bios, bio);
-	spin_unlock_bh(&blkg->async_bio_lock);
-
-	queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
-	return true;
-}
-
 /*
  * Scale the accumulated delay based on how long it has been since we updated
  * the delay.  We only call this when we are adding delay, in case it's been a
@@ -2085,16 +2106,5 @@ bool blk_cgroup_congested(void)
 	return ret;
 }
 
-static int __init blkcg_init(void)
-{
-	blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
-					    WQ_MEM_RECLAIM | WQ_FREEZABLE |
-					    WQ_UNBOUND | WQ_SYSFS, 0);
-	if (!blkcg_punt_bio_wq)
-		return -ENOMEM;
-	return 0;
-}
-subsys_initcall(blkcg_init);
-
 module_param(blkcg_debug_stats, bool, 0644);
 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
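With __blkcg_punt_bio_submit() and its flag-consuming inline wrapper gone, punting is no longer a side effect of submit_bio() but an explicit call. The difference at a call site, schematically (a sketch, not lines taken from this diff):

	/* Old scheme: flag set by the filesystem, consumed inside submit_bio() */
	bio->bi_opf |= REQ_CGROUP_PUNT;
	submit_bio(bio);

	/* New scheme: the few callers that need punting say so directly */
	blkcg_punt_bio_submit(bio);
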
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 9c5078755e5e..e98d2c1be354 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -72,9 +72,10 @@ struct blkcg_gq {
 	struct blkg_iostat_set iostat;
 
 	struct blkg_policy_data		*pd[BLKCG_MAX_POLS];
-
+#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
 	spinlock_t			async_bio_lock;
 	struct bio_list			async_bios;
+#endif
 	union {
 		struct work_struct	async_bio_work;
 		struct work_struct	free_work;
@@ -375,16 +376,6 @@ static inline void blkg_put(struct blkcg_gq *blkg)
 	if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css),	\
 				    (p_blkg)->q)))
 
-bool __blkcg_punt_bio_submit(struct bio *bio);
-
-static inline bool blkcg_punt_bio_submit(struct bio *bio)
-{
-	if (bio->bi_opf & REQ_CGROUP_PUNT)
-		return __blkcg_punt_bio_submit(bio);
-	else
-		return false;
-}
-
 static inline void blkcg_bio_issue_init(struct bio *bio)
 {
 	bio_issue_init(&bio->bi_issue, bio_sectors(bio));
@@ -506,8 +497,6 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return
 static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
 static inline void blkg_get(struct blkcg_gq *blkg) { }
 static inline void blkg_put(struct blkcg_gq *blkg) { }
-
-static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; }
 static inline void blkcg_bio_issue_init(struct bio *bio) { }
 static inline void blk_cgroup_bio_start(struct bio *bio) { }
 static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; }
diff --git a/block/blk-core.c b/block/blk-core.c
index 42926e6cb83c..478978dcb2bd 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -830,9 +830,6 @@ EXPORT_SYMBOL(submit_bio_noacct);
  */
 void submit_bio(struct bio *bio)
 {
-	if (blkcg_punt_bio_submit(bio))
-		return;
-
 	if (bio_op(bio) == REQ_OP_READ) {
 		task_io_account_read(bio->bi_iter.bi_size);
 		count_vm_events(PGPGIN, bio_sectors(bio));
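One effect of moving the check out of submit_bio() is that every bio submission in the system no longer pays for a branch that only btrfs ever used. Assuming the rest of the function is unchanged (only the start of it is visible in the hunk above), the resulting fast path looks roughly like:

	void submit_bio(struct bio *bio)
	{
		/* blkcg_punt_bio_submit() check is gone; accounting comes first */
		if (bio_op(bio) == REQ_OP_READ) {
			task_io_account_read(bio->bi_iter.bi_size);
			count_vm_events(PGPGIN, bio_sectors(bio));
		}
		/* ... write accounting and submit_bio_noacct(bio) follow ... */
	}
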
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 37b6bab90c83..66fa9ab2c046 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -2,6 +2,7 @@
 
 config BTRFS_FS
 	tristate "Btrfs filesystem support"
+	select BLK_CGROUP_PUNT_BIO
 	select CRYPTO
 	select CRYPTO_CRC32C
 	select LIBCRC32C
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 726592868e9c..5379c4714905 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -31,11 +31,11 @@ struct btrfs_failed_bio {
  * Initialize a btrfs_bio structure.  This skips the embedded bio itself as it
  * is already initialized by the block layer.
  */
-void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode,
+void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
 		    btrfs_bio_end_io_t end_io, void *private)
 {
 	memset(bbio, 0, offsetof(struct btrfs_bio, bio));
-	bbio->inode = inode;
+	bbio->fs_info = fs_info;
 	bbio->end_io = end_io;
 	bbio->private = private;
 	atomic_set(&bbio->pending_ios, 1);
@@ -48,41 +48,58 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode,
  * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
  * a mempool.
  */
-struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
-			    struct btrfs_inode *inode,
-			    btrfs_bio_end_io_t end_io, void *private)
+struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+				  struct btrfs_fs_info *fs_info,
+				  btrfs_bio_end_io_t end_io, void *private)
 {
+	struct btrfs_bio *bbio;
 	struct bio *bio;
 
 	bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
-	btrfs_bio_init(btrfs_bio(bio), inode, end_io, private);
-	return bio;
+	bbio = btrfs_bio(bio);
+	btrfs_bio_init(bbio, fs_info, end_io, private);
+	return bbio;
 }
 
-static struct bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
-				   struct bio *orig, u64 map_length,
-				   bool use_append)
+static blk_status_t btrfs_bio_extract_ordered_extent(struct btrfs_bio *bbio)
 {
-	struct btrfs_bio *orig_bbio = btrfs_bio(orig);
+	struct btrfs_ordered_extent *ordered;
+	int ret;
+
+	ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset);
+	if (WARN_ON_ONCE(!ordered))
+		return BLK_STS_IOERR;
+	ret = btrfs_extract_ordered_extent(bbio, ordered);
+	btrfs_put_ordered_extent(ordered);
+
+	return errno_to_blk_status(ret);
+}
+
+static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
+					 struct btrfs_bio *orig_bbio,
+					 u64 map_length, bool use_append)
+{
+	struct btrfs_bio *bbio;
 	struct bio *bio;
 
 	if (use_append) {
 		unsigned int nr_segs;
 
-		bio = bio_split_rw(orig, &fs_info->limits, &nr_segs,
+		bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs,
 				   &btrfs_clone_bioset, map_length);
 	} else {
-		bio = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS,
-				&btrfs_clone_bioset);
+		bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT,
+				GFP_NOFS, &btrfs_clone_bioset);
 	}
-	btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, NULL, orig_bbio);
-
-	btrfs_bio(bio)->file_offset = orig_bbio->file_offset;
-	if (!(orig->bi_opf & REQ_BTRFS_ONE_ORDERED))
+	bbio = btrfs_bio(bio);
+	btrfs_bio_init(bbio, fs_info, NULL, orig_bbio);
+	bbio->inode = orig_bbio->inode;
+	bbio->file_offset = orig_bbio->file_offset;
+	if (!(orig_bbio->bio.bi_opf & REQ_BTRFS_ONE_ORDERED))
 		orig_bbio->file_offset += map_length;
 
 	atomic_inc(&orig_bbio->pending_ios);
-	return bio;
+	return bbio;
 }
 
 static void btrfs_orig_write_end_io(struct bio *bio);
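btrfs_bio_alloc() now returns the btrfs_bio itself and is keyed off the fs_info; only data bios get an inode, which the submitter assigns after allocation (btrfs_split_bio() above does exactly that when cloning). A hedged usage sketch based on the signatures above; the end-io callback and the helper wrapping it are hypothetical:

	static void myfs_data_read_done(struct btrfs_bio *bbio)
	{
		/* ... unlock pages, propagate bbio->bio.bi_status ... */
	}

	static void myfs_start_data_read(struct btrfs_inode *inode,
					 u64 file_offset, unsigned int nr_vecs)
	{
		struct btrfs_bio *bbio;

		bbio = btrfs_bio_alloc(nr_vecs, REQ_OP_READ,
				       inode->root->fs_info,
				       myfs_data_read_done, NULL);
		/* data I/O: the submitter fills in the inode and file offset */
		bbio->inode = inode;
		bbio->file_offset = file_offset;
		/* ... add pages to &bbio->bio ... */
		btrfs_submit_bio(bbio, 0);
	}
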
@@ -164,7 +181,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
 		goto done;
 	}
 
-	btrfs_submit_bio(&repair_bbio->bio, mirror);
+	btrfs_submit_bio(repair_bbio, mirror);
 	return;
 }
 
@@ -224,15 +241,16 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
 	repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS,
 				      &btrfs_repair_bioset);
 	repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector;
-	bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);
+	__bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);
 
 	repair_bbio = btrfs_bio(repair_bio);
-	btrfs_bio_init(repair_bbio, failed_bbio->inode, NULL, fbio);
+	btrfs_bio_init(repair_bbio, fs_info, NULL, fbio);
+	repair_bbio->inode = failed_bbio->inode;
 	repair_bbio->file_offset = failed_bbio->file_offset + bio_offset;
 
 	mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
 	btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
-	btrfs_submit_bio(repair_bio, mirror);
+	btrfs_submit_bio(repair_bbio, mirror);
 	return fbio;
 }
 
@@ -246,6 +264,9 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de
 	struct btrfs_failed_bio *fbio = NULL;
 	u32 offset = 0;
 
+	/* Read-repair requires the inode field to be set by the submitter. */
+	ASSERT(inode);
+
 	/*
 	 * Hand off repair bios to the repair code as there is no upper level
 	 * submitter for them.
@@ -306,17 +327,17 @@ static void btrfs_end_bio_work(struct work_struct *work)
 	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
 
 	/* Metadata reads are checked and repaired by the submitter. */
-	if (bbio->bio.bi_opf & REQ_META)
-		bbio->end_io(bbio);
-	else
+	if (bbio->inode && !(bbio->bio.bi_opf & REQ_META))
 		btrfs_check_read_bio(bbio, bbio->bio.bi_private);
+	else
+		bbio->end_io(bbio);
 }
 
 static void btrfs_simple_end_io(struct bio *bio)
 {
 	struct btrfs_bio *bbio = btrfs_bio(bio);
 	struct btrfs_device *dev = bio->bi_private;
-	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+	struct btrfs_fs_info *fs_info = bbio->fs_info;
 
 	btrfs_bio_counter_dec(fs_info);
 
@@ -340,7 +361,8 @@ static void btrfs_raid56_end_io(struct bio *bio)
 	btrfs_bio_counter_dec(bioc->fs_info);
 	bbio->mirror_num = bioc->mirror_num;
 
-	if (bio_op(bio) == REQ_OP_READ && !(bbio->bio.bi_opf & REQ_META))
+	if (bio_op(bio) == REQ_OP_READ && bbio->inode &&
+	    !(bbio->bio.bi_opf & REQ_META))
 		btrfs_check_read_bio(bbio, NULL);
 	else
 		btrfs_orig_bbio_end_io(bbio);
@@ -418,7 +440,11 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
 		dev->devid, bio->bi_iter.bi_size);
 
 	btrfsic_check_bio(bio);
-	submit_bio(bio);
+
+	if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
+		blkcg_punt_bio_submit(bio);
+	else
+		submit_bio(bio);
 }
 
 static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
@@ -534,10 +560,10 @@ static void run_one_async_done(struct btrfs_work *work)
 
 	/*
	 * All of the bios that pass through here are from async helpers.
-	 * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
-	 * This changes nothing when cgroups aren't in use.
+	 * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's
+	 * context.  This changes nothing when cgroups aren't in use.
 	 */
-	bio->bi_opf |= REQ_CGROUP_PUNT;
+	bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
 	__btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
 }
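Within btrfs the flag round-trip now stays internal to the filesystem: the async helper tags the bio and only the final device-level submission interprets the tag. Condensed from the two hunks above:

	/* in run_one_async_done(): mark bios issued from async helper threads */
	bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;

	/* in btrfs_submit_dev_bio(): route tagged bios via the blkcg worker */
	if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
		blkcg_punt_bio_submit(bio);
	else
		submit_bio(bio);

REQ_BTRFS_CGROUP_PUNT itself is defined elsewhere in the series, presumably as a filesystem-private REQ_* bit; its definition is not part of this excerpt.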
@@ -562,7 +588,7 @@ static bool should_async_write(struct btrfs_bio *bbio)
 	 * in order.
 	 */
 	if (bbio->bio.bi_opf & REQ_META) {
-		struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+		struct btrfs_fs_info *fs_info = bbio->fs_info;
 
 		if (btrfs_is_zoned(fs_info))
 			return false;
@@ -582,7 +608,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
 				struct btrfs_io_context *bioc,
 				struct btrfs_io_stripe *smap, int mirror_num)
 {
-	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+	struct btrfs_fs_info *fs_info = bbio->fs_info;
 	struct async_submit_bio *async;
 
 	async = kmalloc(sizeof(*async), GFP_NOFS);
@@ -603,12 +629,12 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
 	return true;
 }
 
-static bool btrfs_submit_chunk(struct bio *bio, int mirror_num)
+static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 {
-	struct btrfs_bio *bbio = btrfs_bio(bio);
 	struct btrfs_inode *inode = bbio->inode;
-	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct btrfs_fs_info *fs_info = bbio->fs_info;
 	struct btrfs_bio *orig_bbio = bbio;
+	struct bio *bio = &bbio->bio;
 	u64 logical = bio->bi_iter.bi_sector << 9;
 	u64 length = bio->bi_iter.bi_size;
 	u64 map_length = length;
@@ -631,15 +657,15 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num)
 		map_length = min(map_length, fs_info->max_zone_append_size);
 
 	if (map_length < length) {
-		bio = btrfs_split_bio(fs_info, bio, map_length, use_append);
-		bbio = btrfs_bio(bio);
+		bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append);
+		bio = &bbio->bio;
 	}
 
 	/*
 	 * Save the iter for the end_io handler and preload the checksums for
 	 * data reads.
 	 */
-	if (bio_op(bio) == REQ_OP_READ && !(bio->bi_opf & REQ_META)) {
+	if (bio_op(bio) == REQ_OP_READ && inode && !(bio->bi_opf & REQ_META)) {
 		bbio->saved_iter = bio->bi_iter;
 		ret = btrfs_lookup_bio_sums(bbio);
 		if (ret)
@@ -650,7 +676,7 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num)
 	if (use_append) {
 		bio->bi_opf &= ~REQ_OP_WRITE;
 		bio->bi_opf |= REQ_OP_ZONE_APPEND;
-		ret = btrfs_extract_ordered_extent(btrfs_bio(bio));
+		ret = btrfs_bio_extract_ordered_extent(bbio);
 		if (ret)
 			goto fail_put_bio;
 	}
@@ -659,7 +685,7 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num)
 	 * Csum items for reloc roots have already been cloned at this
 	 * point, so they are handled as part of the no-checksum case.
 	 */
-	if (!(inode->flags & BTRFS_INODE_NODATASUM) &&
+	if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) &&
 	    !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
 	    !btrfs_is_data_reloc_root(inode->root)) {
 		if (should_async_write(bbio) &&
@@ -686,9 +712,12 @@ fail:
 	return true;
 }
 
-void btrfs_submit_bio(struct bio *bio, int mirror_num)
+void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
 {
-	while (!btrfs_submit_chunk(bio, mirror_num))
+	/* If bbio->inode is not populated, its file_offset must be 0. */
+	ASSERT(bbio->inode || bbio->file_offset == 0);
+
+	while (!btrfs_submit_chunk(bbio, mirror_num))
 		;
 }
 
@@ -706,12 +735,9 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
 		 u64 length, u64 logical, struct page *page,
 		 unsigned int pg_offset, int mirror_num)
 {
-	struct btrfs_device *dev;
+	struct btrfs_io_stripe smap = { 0 };
 	struct bio_vec bvec;
 	struct bio bio;
-	u64 map_length = 0;
-	u64 sector;
-	struct btrfs_io_context *bioc = NULL;
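The excerpt ends partway through the btrfs_repair_io_failure() hunk. The ASSERT added to btrfs_submit_bio() above spells out the contract that the inode checks throughout this file rely on: metadata bios leave bbio->inode NULL and bbio->file_offset at 0, and they are exempt from checksum lookup and read-repair. A hedged sketch of a submitter on the metadata side of that contract (helper names hypothetical):

	static void myfs_meta_read_done(struct btrfs_bio *bbio)
	{
		/* metadata reads are checked/repaired by the submitter itself */
	}

	static void myfs_start_meta_read(struct btrfs_fs_info *fs_info,
					 unsigned int nr_vecs, u64 logical)
	{
		struct btrfs_bio *bbio;

		bbio = btrfs_bio_alloc(nr_vecs, REQ_OP_READ | REQ_META, fs_info,
				       myfs_meta_read_done, NULL);
		bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
		/* bbio->inode stays NULL, so bbio->file_offset must stay 0 */
		btrfs_submit_bio(bbio, 0);
	}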
