diff options
Diffstat (limited to 'fs/btrfs')
56 files changed, 2344 insertions, 2864 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 555c962fdad6..90d53209755b 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -11,7 +11,8 @@ condflags := \ $(call cc-option, -Wunused-but-set-variable) \ $(call cc-option, -Wunused-const-variable) \ $(call cc-option, -Wpacked-not-aligned) \ - $(call cc-option, -Wstringop-truncation) + $(call cc-option, -Wstringop-truncation) \ + $(call cc-option, -Wmaybe-uninitialized) subdir-ccflags-y += $(condflags) # The following turn off the warnings enabled by -Wextra subdir-ccflags-y += -Wno-missing-field-initializers @@ -31,7 +32,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ - subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o + subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \ + lru_cache.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 46851511b661..90e40d5ceccd 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1252,8 +1252,12 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ct struct btrfs_root *root, u64 bytenr, int level, bool *is_shared) { + const struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_backref_shared_cache_entry *entry; + if (!current->journal_info) + lockdep_assert_held(&fs_info->commit_root_sem); + if (!ctx->use_path_cache) return false; @@ -1288,7 +1292,7 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ct * could be a snapshot sharing this extent buffer. */ if (entry->is_shared && - entry->gen != btrfs_get_last_root_drop_gen(root->fs_info)) + entry->gen != btrfs_get_last_root_drop_gen(fs_info)) return false; *is_shared = entry->is_shared; @@ -1318,9 +1322,13 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx struct btrfs_root *root, u64 bytenr, int level, bool is_shared) { + const struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_backref_shared_cache_entry *entry; u64 gen; + if (!current->journal_info) + lockdep_assert_held(&fs_info->commit_root_sem); + if (!ctx->use_path_cache) return; @@ -1336,7 +1344,7 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx ASSERT(level >= 0); if (is_shared) - gen = btrfs_get_last_root_drop_gen(root->fs_info); + gen = btrfs_get_last_root_drop_gen(fs_info); else gen = btrfs_root_last_snapshot(&root->root_item); @@ -1864,6 +1872,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, .have_delayed_delete_refs = false, }; int level; + bool leaf_cached; + bool leaf_is_shared; for (int i = 0; i < BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; i++) { if (ctx->prev_extents_cache[i].bytenr == bytenr) @@ -1885,6 +1895,23 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, walk_ctx.time_seq = elem.seq; } + ctx->use_path_cache = true; + + /* + * We may have previously determined that the current leaf is shared. + * If it is, then we have a data extent that is shared due to a shared + * subtree (caused by snapshotting) and we don't need to check for data + * backrefs. If the leaf is not shared, then we must do backref walking + * to determine if the data extent is shared through reflinks. + */ + leaf_cached = lookup_backref_shared_cache(ctx, root, + ctx->curr_leaf_bytenr, 0, + &leaf_is_shared); + if (leaf_cached && leaf_is_shared) { + ret = 1; + goto out_trans; + } + walk_ctx.ignore_extent_item_pos = true; walk_ctx.trans = trans; walk_ctx.fs_info = fs_info; @@ -1893,7 +1920,6 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, /* -1 means we are in the bytenr of the data extent. */ level = -1; ULIST_ITER_INIT(&uiter); - ctx->use_path_cache = true; while (1) { bool is_shared; bool cached; @@ -1964,6 +1990,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, ctx->prev_extents_cache_slot = slot; } +out_trans: if (trans) { btrfs_put_tree_mod_seq(fs_info, &elem); btrfs_end_transaction(trans); diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 8affc88b0e0a..d8b90f95b157 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -14,19 +14,31 @@ #include "dev-replace.h" #include "rcu-string.h" #include "zoned.h" +#include "file-item.h" static struct bio_set btrfs_bioset; +static struct bio_set btrfs_clone_bioset; +static struct bio_set btrfs_repair_bioset; +static mempool_t btrfs_failed_bio_pool; + +struct btrfs_failed_bio { + struct btrfs_bio *bbio; + int num_copies; + atomic_t repair_count; +}; /* * Initialize a btrfs_bio structure. This skips the embedded bio itself as it * is already initialized by the block layer. */ -static inline void btrfs_bio_init(struct btrfs_bio *bbio, - btrfs_bio_end_io_t end_io, void *private) +void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, + btrfs_bio_end_io_t end_io, void *private) { memset(bbio, 0, offsetof(struct btrfs_bio, bio)); + bbio->inode = inode; bbio->end_io = end_io; bbio->private = private; + atomic_set(&bbio->pending_ios, 1); } /* @@ -37,32 +49,235 @@ static inline void btrfs_bio_init(struct btrfs_bio *bbio, * a mempool. */ struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + struct btrfs_inode *inode, btrfs_bio_end_io_t end_io, void *private) { struct bio *bio; bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); - btrfs_bio_init(btrfs_bio(bio), end_io, private); + btrfs_bio_init(btrfs_bio(bio), inode, end_io, private); return bio; } -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, - btrfs_bio_end_io_t end_io, void *private) +static struct bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, + struct bio *orig, u64 map_length, + bool use_append) { + struct btrfs_bio *orig_bbio = btrfs_bio(orig); struct bio *bio; - struct btrfs_bio *bbio; - ASSERT(offset <= UINT_MAX && size <= UINT_MAX); + if (use_append) { + unsigned int nr_segs; + + bio = bio_split_rw(orig, &fs_info->limits, &nr_segs, + &btrfs_clone_bioset, map_length); + } else { + bio = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS, + &btrfs_clone_bioset); + } + btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, NULL, orig_bbio); - bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); - bbio = btrfs_bio(bio); - btrfs_bio_init(bbio, end_io, private); + btrfs_bio(bio)->file_offset = orig_bbio->file_offset; + if (!(orig->bi_opf & REQ_BTRFS_ONE_ORDERED)) + orig_bbio->file_offset += map_length; - bio_trim(bio, offset >> 9, size >> 9); - bbio->iter = bio->bi_iter; + atomic_inc(&orig_bbio->pending_ios); return bio; } +static void btrfs_orig_write_end_io(struct bio *bio); + +static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, + struct btrfs_bio *orig_bbio) +{ + /* + * For writes we tolerate nr_mirrors - 1 write failures, so we can't + * just blindly propagate a write failure here. Instead increment the + * error count in the original I/O context so that it is guaranteed to + * be larger than the error tolerance. + */ + if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) { + struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private; + struct btrfs_io_context *orig_bioc = orig_stripe->bioc; + + atomic_add(orig_bioc->max_errors, &orig_bioc->error); + } else { + orig_bbio->bio.bi_status = bbio->bio.bi_status; + } +} + +static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio) +{ + if (bbio->bio.bi_pool == &btrfs_clone_bioset) { + struct btrfs_bio *orig_bbio = bbio->private; + + if (bbio->bio.bi_status) + btrfs_bbio_propagate_error(bbio, orig_bbio); + bio_put(&bbio->bio); + bbio = orig_bbio; + } + + if (atomic_dec_and_test(&bbio->pending_ios)) + bbio->end_io(bbio); +} + +static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) +{ + if (cur_mirror == fbio->num_copies) + return cur_mirror + 1 - fbio->num_copies; + return cur_mirror + 1; +} + +static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) +{ + if (cur_mirror == 1) + return fbio->num_copies; + return cur_mirror - 1; +} + +static void btrfs_repair_done(struct btrfs_failed_bio *fbio) +{ + if (atomic_dec_and_test(&fbio->repair_count)) { + btrfs_orig_bbio_end_io(fbio->bbio); + mempool_free(fbio, &btrfs_failed_bio_pool); + } +} + +static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, + struct btrfs_device *dev) +{ + struct btrfs_failed_bio *fbio = repair_bbio->private; + struct btrfs_inode *inode = repair_bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio); + int mirror = repair_bbio->mirror_num; + + if (repair_bbio->bio.bi_status || + !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) { + bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); + repair_bbio->bio.bi_iter = repair_bbio->saved_iter; + + mirror = next_repair_mirror(fbio, mirror); + if (mirror == fbio->bbio->mirror_num) { + btrfs_debug(fs_info, "no mirror left"); + fbio->bbio->bio.bi_status = BLK_STS_IOERR; + goto done; + } + + btrfs_submit_bio(&repair_bbio->bio, mirror); + return; + } + + do { + mirror = prev_repair_mirror(fbio, mirror); + btrfs_repair_io_failure(fs_info, btrfs_ino(inode), + repair_bbio->file_offset, fs_info->sectorsize, + repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT, + bv->bv_page, bv->bv_offset, mirror); + } while (mirror != fbio->bbio->mirror_num); + +done: + btrfs_repair_done(fbio); + bio_put(&repair_bbio->bio); +} + +/* + * Try to kick off a repair read to the next available mirror for a bad sector. + * + * This primarily tries to recover good data to serve the actual read request, + * but also tries to write the good data back to the bad mirror(s) when a + * read succeeded to restore the redundancy. + */ +static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, + u32 bio_offset, + struct bio_vec *bv, + struct btrfs_failed_bio *fbio) +{ + struct btrfs_inode *inode = failed_bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 sectorsize = fs_info->sectorsize; + const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT); + struct btrfs_bio *repair_bbio; + struct bio *repair_bio; + int num_copies; + int mirror; + + btrfs_debug(fs_info, "repair read error: read error at %llu", + failed_bbio->file_offset + bio_offset); + + num_copies = btrfs_num_copies(fs_info, logical, sectorsize); + if (num_copies == 1) { + btrfs_debug(fs_info, "no copy to repair from"); + failed_bbio->bio.bi_status = BLK_STS_IOERR; + return fbio; + } + + if (!fbio) { + fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS); + fbio->bbio = failed_bbio; + fbio->num_copies = num_copies; + atomic_set(&fbio->repair_count, 1); + } + + atomic_inc(&fbio->repair_count); + + repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, + &btrfs_repair_bioset); + repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector; + bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset); + + repair_bbio = btrfs_bio(repair_bio); + btrfs_bio_init(repair_bbio, failed_bbio->inode, NULL, fbio); + repair_bbio->file_offset = failed_bbio->file_offset + bio_offset; + + mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); + btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); + btrfs_submit_bio(repair_bio, mirror); + return fbio; +} + +static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev) +{ + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u32 sectorsize = fs_info->sectorsize; + struct bvec_iter *iter = &bbio->saved_iter; + blk_status_t status = bbio->bio.bi_status; + struct btrfs_failed_bio *fbio = NULL; + u32 offset = 0; + + /* + * Hand off repair bios to the repair code as there is no upper level + * submitter for them. + */ + if (bbio->bio.bi_pool == &btrfs_repair_bioset) { + btrfs_end_repair_bio(bbio, dev); + return; + } + + /* Clear the I/O error. A failed repair will reset it. */ + bbio->bio.bi_status = BLK_STS_OK; + + while (iter->bi_size) { + struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter); + + bv.bv_len = min(bv.bv_len, sectorsize); + if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv)) + fbio = repair_one_sector(bbio, offset, &bv, fbio); + + bio_advance_iter_single(&bbio->bio, iter, sectorsize); + offset += sectorsize; + } + + if (bbio->csum != bbio->csum_inline) + kfree(bbio->csum); + + if (fbio) + btrfs_repair_done(fbio); + else + btrfs_orig_bbio_end_io(bbio); +} + static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) { if (!dev || !dev->bdev) @@ -90,24 +305,31 @@ static void btrfs_end_bio_work(struct work_struct *work) { struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); - bbio->end_io(bbio); + /* Metadata reads are checked and repaired by the submitter. */ + if (bbio->bio.bi_opf & REQ_META) + bbio->end_io(bbio); + else + btrfs_check_read_bio(bbio, bbio->bio.bi_private); } static void btrfs_simple_end_io(struct bio *bio) { - struct btrfs_fs_info *fs_info = bio->bi_private; struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_device *dev = bio->bi_private; + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; btrfs_bio_counter_dec(fs_info); if (bio->bi_status) - btrfs_log_dev_io_error(bio, bbio->device); + btrfs_log_dev_io_error(bio, dev); if (bio_op(bio) == REQ_OP_READ) { INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); } else { - bbio->end_io(bbio); + if (bio_op(bio) == REQ_OP_ZONE_APPEND) + btrfs_record_physical_zoned(bbio); + btrfs_orig_bbio_end_io(bbio); } } @@ -118,7 +340,10 @@ static void btrfs_raid56_end_io(struct bio *bio) btrfs_bio_counter_dec(bioc->fs_info); bbio->mirror_num = bioc->mirror_num; - bbio->end_io(bbio); + if (bio_op(bio) == REQ_OP_READ && !(bbio->bio.bi_opf & REQ_META)) + btrfs_check_read_bio(bbio, NULL); + else + btrfs_orig_bbio_end_io(bbio); btrfs_put_bioc(bioc); } @@ -145,7 +370,7 @@ static void btrfs_orig_write_end_io(struct bio *bio) else bio->bi_status = BLK_STS_OK; - bbio->end_io(bbio); + btrfs_orig_bbio_end_io(bbio); btrfs_put_bioc(bioc); } @@ -181,16 +406,10 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) */ if (bio_op(bio) == REQ_OP_ZONE_APPEND) { u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 zone_start = round_down(physical, dev->fs_info->zone_size); - if (btrfs_dev_is_sequential(dev, physical)) { - u64 zone_start = round_down(physical, - dev->fs_info->zone_size); - - bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; - } else { - bio->bi_opf &= ~REQ_OP_ZONE_APPEND; - bio->bi_opf |= REQ_OP_WRITE; - } + ASSERT(btrfs_dev_is_sequential(dev, physical)); + bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; } btrfs_debug_in_rcu(dev->fs_info, "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", @@ -224,41 +443,21 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); } -void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) +static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, int mirror_num) { - u64 logical = bio->bi_iter.bi_sector << 9; - u64 length = bio->bi_iter.bi_size; - u64 map_length = length; - struct btrfs_io_context *bioc = NULL; - struct btrfs_io_stripe smap; - int ret; - - btrfs_bio_counter_inc_blocked(fs_info); - ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - &bioc, &smap, &mirror_num, 1); - if (ret) { - btrfs_bio_counter_dec(fs_info); - btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); - return; - } - - if (map_length < length) { - btrfs_crit(fs_info, - "mapping failed logical %llu bio len %llu len %llu", - logical, length, map_length); - BUG(); - } + /* Do not leak our private flag into the block layer. */ + bio->bi_opf &= ~REQ_BTRFS_ONE_ORDERED; if (!bioc) { - /* Single mirror read/write fast path */ + /* Single mirror read/write fast path. */ btrfs_bio(bio)->mirror_num = mirror_num; - btrfs_bio(bio)->device = smap.dev; - bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; - bio->bi_private = fs_info; + bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT; + bio->bi_private = smap->dev; bio->bi_end_io = btrfs_simple_end_io; - btrfs_submit_dev_bio(smap.dev, bio); + btrfs_submit_dev_bio(smap->dev, bio); } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - /* Parity RAID write or read recovery */ + /* Parity RAID write or read recovery. */ bio->bi_private = bioc; bio->bi_end_io = btrfs_raid56_end_io; if (bio_op(bio) == REQ_OP_READ) @@ -266,16 +465,233 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror else raid56_parity_write(bio, bioc); } else { - /* Write to multiple mirrors */ + /* Write to multiple mirrors. */ int total_devs = bioc->num_stripes; - int dev_nr; bioc->orig_bio = bio; - for (dev_nr = 0; dev_nr < total_devs; dev_nr++) + for (int dev_nr = 0; dev_nr < total_devs; dev_nr++) btrfs_submit_mirrored_bio(bioc, dev_nr); } } +static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio) +{ + if (bbio->bio.bi_opf & REQ_META) + return btree_csum_one_bio(bbio); + return btrfs_csum_one_bio(bbio); +} + +/* + * Async submit bios are used to offload expensive checksumming onto the worker + * threads. + */ +struct async_submit_bio { + struct btrfs_bio *bbio; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe smap; + int mirror_num; + struct btrfs_work work; +}; + +/* + * In order to insert checksums into the metadata in large chunks, we wait + * until bio submission time. All the pages in the bio are checksummed and + * sums are attached onto the ordered extent record. + * + * At IO completion time the csums attached on the ordered extent record are + * inserted into the btree. + */ +static void run_one_async_start(struct btrfs_work *work) +{ + struct async_submit_bio *async = + container_of(work, struct async_submit_bio, work); + blk_status_t ret; + + ret = btrfs_bio_csum(async->bbio); + if (ret) + async->bbio->bio.bi_status = ret; +} + +/* + * In order to insert checksums into the metadata in large chunks, we wait + * until bio submission time. All the pages in the bio are checksummed and + * sums are attached onto the ordered extent record. + * + * At IO completion time the csums attached on the ordered extent record are + * inserted into the tree. + */ +static void run_one_async_done(struct btrfs_work *work) +{ + struct async_submit_bio *async = + container_of(work, struct async_submit_bio, work); + struct bio *bio = &async->bbio->bio; + + /* If an error occurred we just want to clean up the bio and move on. */ + if (bio->bi_status) { + btrfs_orig_bbio_end_io(async->bbio); + return; + } + + /* + * All of the bios that pass through here are from async helpers. + * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. + * This changes nothing when cgroups aren't in use. + */ + bio->bi_opf |= REQ_CGROUP_PUNT; + __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); +} + +static void run_one_async_free(struct btrfs_work *work) +{ + kfree(container_of(work, struct async_submit_bio, work)); +} + +static bool should_async_write(struct btrfs_bio *bbio) +{ + /* + * If the I/O is not issued by fsync and friends, (->sync_writers != 0), + * then try |