summary refs log tree commit diff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- fs/btrfs/Makefile 6
-rw-r--r-- fs/btrfs/backref.c 33
-rw-r--r-- fs/btrfs/bio.c 557
-rw-r--r-- fs/btrfs/bio.h 67
-rw-r--r-- fs/btrfs/block-group.c 273
-rw-r--r-- fs/btrfs/block-group.h 24
-rw-r--r-- fs/btrfs/btrfs_inode.h 22
-rw-r--r-- fs/btrfs/compression.c 276
-rw-r--r-- fs/btrfs/compression.h 3
-rw-r--r-- fs/btrfs/ctree.c 62
-rw-r--r-- fs/btrfs/ctree.h 15
-rw-r--r-- fs/btrfs/defrag.c 4
-rw-r--r-- fs/btrfs/delayed-ref.c 24
-rw-r--r-- fs/btrfs/delayed-ref.h 2
-rw-r--r-- fs/btrfs/discard.c 41
-rw-r--r-- fs/btrfs/disk-io.c 225
-rw-r--r-- fs/btrfs/disk-io.h 14
-rw-r--r-- fs/btrfs/extent-io-tree.c 10
-rw-r--r-- fs/btrfs/extent-io-tree.h 1
-rw-r--r-- fs/btrfs/extent-tree.c 181
-rw-r--r-- fs/btrfs/extent-tree.h 81
-rw-r--r-- fs/btrfs/extent_io.c 582
-rw-r--r-- fs/btrfs/extent_io.h 36
-rw-r--r-- fs/btrfs/file-item.c 72
-rw-r--r-- fs/btrfs/file-item.h 8
-rw-r--r-- fs/btrfs/file.c 2
-rw-r--r-- fs/btrfs/free-space-tree.c 2
-rw-r--r-- fs/btrfs/fs.c 4
-rw-r--r-- fs/btrfs/fs.h 11
-rw-r--r-- fs/btrfs/inode.c 641
-rw-r--r-- fs/btrfs/ioctl.c 2
-rw-r--r-- fs/btrfs/lru_cache.c 166
-rw-r--r-- fs/btrfs/lru_cache.h 80
-rw-r--r-- fs/btrfs/lzo.c 2
-rw-r--r-- fs/btrfs/messages.c 30
-rw-r--r-- fs/btrfs/messages.h 34
-rw-r--r-- fs/btrfs/ordered-data.c 25
-rw-r--r-- fs/btrfs/ordered-data.h 3
-rw-r--r-- fs/btrfs/qgroup.c 2
-rw-r--r-- fs/btrfs/raid56.c 334
-rw-r--r-- fs/btrfs/raid56.h 4
-rw-r--r-- fs/btrfs/relocation.c 2
-rw-r--r-- fs/btrfs/scrub.c 51
-rw-r--r-- fs/btrfs/send.c 684
-rw-r--r-- fs/btrfs/super.c 3
-rw-r--r-- fs/btrfs/sysfs.c 41
-rw-r--r-- fs/btrfs/sysfs.h 3
-rw-r--r-- fs/btrfs/tests/extent-map-tests.c 2
-rw-r--r-- fs/btrfs/transaction.c 34
-rw-r--r-- fs/btrfs/transaction.h 31
-rw-r--r-- fs/btrfs/tree-log.c 87
-rw-r--r-- fs/btrfs/tree-log.h 9
-rw-r--r-- fs/btrfs/volumes.c 116
-rw-r--r-- fs/btrfs/volumes.h 18
-rw-r--r-- fs/btrfs/zoned.c 146
-rw-r--r-- fs/btrfs/zoned.h 20
56 files changed, 2344 insertions, 2864 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 555c962fdad6..90d53209755b 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -11,7 +11,8 @@ condflags := \
$(call cc-option, -Wunused-but-set-variable) \
$(call cc-option, -Wunused-const-variable) \
$(call cc-option, -Wpacked-not-aligned) \
- $(call cc-option, -Wstringop-truncation)
+ $(call cc-option, -Wstringop-truncation) \
+ $(call cc-option, -Wmaybe-uninitialized)
subdir-ccflags-y += $(condflags)
# The following turn off the warnings enabled by -Wextra
subdir-ccflags-y += -Wno-missing-field-initializers
@@ -31,7 +32,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
- subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o
+ subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \
+ lru_cache.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 46851511b661..90e40d5ceccd 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1252,8 +1252,12 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ct
struct btrfs_root *root,
u64 bytenr, int level, bool *is_shared)
{
+ const struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_backref_shared_cache_entry *entry;
+ if (!current->journal_info)
+ lockdep_assert_held(&fs_info->commit_root_sem);
+
if (!ctx->use_path_cache)
return false;
@@ -1288,7 +1292,7 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ct
* could be a snapshot sharing this extent buffer.
*/
if (entry->is_shared &&
- entry->gen != btrfs_get_last_root_drop_gen(root->fs_info))
+ entry->gen != btrfs_get_last_root_drop_gen(fs_info))
return false;
*is_shared = entry->is_shared;
@@ -1318,9 +1322,13 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx
struct btrfs_root *root,
u64 bytenr, int level, bool is_shared)
{
+ const struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_backref_shared_cache_entry *entry;
u64 gen;
+ if (!current->journal_info)
+ lockdep_assert_held(&fs_info->commit_root_sem);
+
if (!ctx->use_path_cache)
return;
@@ -1336,7 +1344,7 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx
ASSERT(level >= 0);
if (is_shared)
- gen = btrfs_get_last_root_drop_gen(root->fs_info);
+ gen = btrfs_get_last_root_drop_gen(fs_info);
else
gen = btrfs_root_last_snapshot(&root->root_item);
@@ -1864,6 +1872,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
.have_delayed_delete_refs = false,
};
int level;
+ bool leaf_cached;
+ bool leaf_is_shared;
for (int i = 0; i < BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; i++) {
if (ctx->prev_extents_cache[i].bytenr == bytenr)
@@ -1885,6 +1895,23 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
walk_ctx.time_seq = elem.seq;
}
+ ctx->use_path_cache = true;
+
+ /*
+ * We may have previously determined that the current leaf is shared.
+ * If it is, then we have a data extent that is shared due to a shared
+ * subtree (caused by snapshotting) and we don't need to check for data
+ * backrefs. If the leaf is not shared, then we must do backref walking
+ * to determine if the data extent is shared through reflinks.
+ */
+ leaf_cached = lookup_backref_shared_cache(ctx, root,
+ ctx->curr_leaf_bytenr, 0,
+ &leaf_is_shared);
+ if (leaf_cached && leaf_is_shared) {
+ ret = 1;
+ goto out_trans;
+ }
+
walk_ctx.ignore_extent_item_pos = true;
walk_ctx.trans = trans;
walk_ctx.fs_info = fs_info;
@@ -1893,7 +1920,6 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
/* -1 means we are in the bytenr of the data extent. */
level = -1;
ULIST_ITER_INIT(&uiter);
- ctx->use_path_cache = true;
while (1) {
bool is_shared;
bool cached;
@@ -1964,6 +1990,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
ctx->prev_extents_cache_slot = slot;
}
+out_trans:
if (trans) {
btrfs_put_tree_mod_seq(fs_info, &elem);
btrfs_end_transaction(trans);
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 8affc88b0e0a..d8b90f95b157 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -14,19 +14,31 @@
#include "dev-replace.h"
#include "rcu-string.h"
#include "zoned.h"
+#include "file-item.h"
static struct bio_set btrfs_bioset;
+static struct bio_set btrfs_clone_bioset;
+static struct bio_set btrfs_repair_bioset;
+static mempool_t btrfs_failed_bio_pool;
+
+struct btrfs_failed_bio {
+ struct btrfs_bio *bbio;
+ int num_copies;
+ atomic_t repair_count;
+};
/*
* Initialize a btrfs_bio structure. This skips the embedded bio itself as it
* is already initialized by the block layer.
*/
-static inline void btrfs_bio_init(struct btrfs_bio *bbio,
- btrfs_bio_end_io_t end_io, void *private)
+void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode,
+ btrfs_bio_end_io_t end_io, void *private)
{
memset(bbio, 0, offsetof(struct btrfs_bio, bio));
+ bbio->inode = inode;
bbio->end_io = end_io;
bbio->private = private;
+ atomic_set(&bbio->pending_ios, 1);
}
/*
@@ -37,32 +49,235 @@ static inline void btrfs_bio_init(struct btrfs_bio *bbio,
* a mempool.
*/
struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+ struct btrfs_inode *inode,
btrfs_bio_end_io_t end_io, void *private)
{
struct bio *bio;
bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
- btrfs_bio_init(btrfs_bio(bio), end_io, private);
+ btrfs_bio_init(btrfs_bio(bio), inode, end_io, private);
return bio;
}
-struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
- btrfs_bio_end_io_t end_io, void *private)
+static struct bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
+ struct bio *orig, u64 map_length,
+ bool use_append)
{
+ struct btrfs_bio *orig_bbio = btrfs_bio(orig);
struct bio *bio;
- struct btrfs_bio *bbio;
- ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
+ if (use_append) {
+ unsigned int nr_segs;
+
+ bio = bio_split_rw(orig, &fs_info->limits, &nr_segs,
+ &btrfs_clone_bioset, map_length);
+ } else {
+ bio = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS,
+ &btrfs_clone_bioset);
+ }
+ btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, NULL, orig_bbio);
- bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
- bbio = btrfs_bio(bio);
- btrfs_bio_init(bbio, end_io, private);
+ btrfs_bio(bio)->file_offset = orig_bbio->file_offset;
+ if (!(orig->bi_opf & REQ_BTRFS_ONE_ORDERED))
+ orig_bbio->file_offset += map_length;
- bio_trim(bio, offset >> 9, size >> 9);
- bbio->iter = bio->bi_iter;
+ atomic_inc(&orig_bbio->pending_ios);
return bio;
}
+static void btrfs_orig_write_end_io(struct bio *bio);
+
+static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
+ struct btrfs_bio *orig_bbio)
+{
+ /*
+ * For writes we tolerate nr_mirrors - 1 write failures, so we can't
+ * just blindly propagate a write failure here. Instead increment the
+ * error count in the original I/O context so that it is guaranteed to
+ * be larger than the error tolerance.
+ */
+ if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) {
+ struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private;
+ struct btrfs_io_context *orig_bioc = orig_stripe->bioc;
+
+ atomic_add(orig_bioc->max_errors, &orig_bioc->error);
+ } else {
+ orig_bbio->bio.bi_status = bbio->bio.bi_status;
+ }
+}
+
+static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio)
+{
+ if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
+ struct btrfs_bio *orig_bbio = bbio->private;
+
+ if (bbio->bio.bi_status)
+ btrfs_bbio_propagate_error(bbio, orig_bbio);
+ bio_put(&bbio->bio);
+ bbio = orig_bbio;
+ }
+
+ if (atomic_dec_and_test(&bbio->pending_ios))
+ bbio->end_io(bbio);
+}
+
+static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
+{
+ if (cur_mirror == fbio->num_copies)
+ return cur_mirror + 1 - fbio->num_copies;
+ return cur_mirror + 1;
+}
+
+static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
+{
+ if (cur_mirror == 1)
+ return fbio->num_copies;
+ return cur_mirror - 1;
+}
+
+static void btrfs_repair_done(struct btrfs_failed_bio *fbio)
+{
+ if (atomic_dec_and_test(&fbio->repair_count)) {
+ btrfs_orig_bbio_end_io(fbio->bbio);
+ mempool_free(fbio, &btrfs_failed_bio_pool);
+ }
+}
+
+static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
+ struct btrfs_device *dev)
+{
+ struct btrfs_failed_bio *fbio = repair_bbio->private;
+ struct btrfs_inode *inode = repair_bbio->inode;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio);
+ int mirror = repair_bbio->mirror_num;
+
+ if (repair_bbio->bio.bi_status ||
+ !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) {
+ bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
+ repair_bbio->bio.bi_iter = repair_bbio->saved_iter;
+
+ mirror = next_repair_mirror(fbio, mirror);
+ if (mirror == fbio->bbio->mirror_num) {
+ btrfs_debug(fs_info, "no mirror left");
+ fbio->bbio->bio.bi_status = BLK_STS_IOERR;
+ goto done;
+ }
+
+ btrfs_submit_bio(&repair_bbio->bio, mirror);
+ return;
+ }
+
+ do {
+ mirror = prev_repair_mirror(fbio, mirror);
+ btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
+ repair_bbio->file_offset, fs_info->sectorsize,
+ repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
+ bv->bv_page, bv->bv_offset, mirror);
+ } while (mirror != fbio->bbio->mirror_num);
+
+done:
+ btrfs_repair_done(fbio);
+ bio_put(&repair_bbio->bio);
+}
+
+/*
+ * Try to kick off a repair read to the next available mirror for a bad sector.
+ *
+ * This primarily tries to recover good data to serve the actual read request,
+ * but also tries to write the good data back to the bad mirror(s) when a
+ * read succeeded to restore the redundancy.
+ */
+static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
+ u32 bio_offset,
+ struct bio_vec *bv,
+ struct btrfs_failed_bio *fbio)
+{
+ struct btrfs_inode *inode = failed_bbio->inode;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const u32 sectorsize = fs_info->sectorsize;
+ const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT);
+ struct btrfs_bio *repair_bbio;
+ struct bio *repair_bio;
+ int num_copies;
+ int mirror;
+
+ btrfs_debug(fs_info, "repair read error: read error at %llu",
+ failed_bbio->file_offset + bio_offset);
+
+ num_copies = btrfs_num_copies(fs_info, logical, sectorsize);
+ if (num_copies == 1) {
+ btrfs_debug(fs_info, "no copy to repair from");
+ failed_bbio->bio.bi_status = BLK_STS_IOERR;
+ return fbio;
+ }
+
+ if (!fbio) {
+ fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS);
+ fbio->bbio = failed_bbio;
+ fbio->num_copies = num_copies;
+ atomic_set(&fbio->repair_count, 1);
+ }
+
+ atomic_inc(&fbio->repair_count);
+
+ repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS,
+ &btrfs_repair_bioset);
+ repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector;
+ bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);
+
+ repair_bbio = btrfs_bio(repair_bio);
+ btrfs_bio_init(repair_bbio, failed_bbio->inode, NULL, fbio);
+ repair_bbio->file_offset = failed_bbio->file_offset + bio_offset;
+
+ mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
+ btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
+ btrfs_submit_bio(repair_bio, mirror);
+ return fbio;
+}
+
+static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev)
+{
+ struct btrfs_inode *inode = bbio->inode;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ u32 sectorsize = fs_info->sectorsize;
+ struct bvec_iter *iter = &bbio->saved_iter;
+ blk_status_t status = bbio->bio.bi_status;
+ struct btrfs_failed_bio *fbio = NULL;
+ u32 offset = 0;
+
+ /*
+ * Hand off repair bios to the repair code as there is no upper level
+ * submitter for them.
+ */
+ if (bbio->bio.bi_pool == &btrfs_repair_bioset) {
+ btrfs_end_repair_bio(bbio, dev);
+ return;
+ }
+
+ /* Clear the I/O error. A failed repair will reset it. */
+ bbio->bio.bi_status = BLK_STS_OK;
+
+ while (iter->bi_size) {
+ struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter);
+
+ bv.bv_len = min(bv.bv_len, sectorsize);
+ if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv))
+ fbio = repair_one_sector(bbio, offset, &bv, fbio);
+
+ bio_advance_iter_single(&bbio->bio, iter, sectorsize);
+ offset += sectorsize;
+ }
+
+ if (bbio->csum != bbio->csum_inline)
+ kfree(bbio->csum);
+
+ if (fbio)
+ btrfs_repair_done(fbio);
+ else
+ btrfs_orig_bbio_end_io(bbio);
+}
+
static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
{
if (!dev || !dev->bdev)
@@ -90,24 +305,31 @@ static void btrfs_end_bio_work(struct work_struct *work)
{
struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
- bbio->end_io(bbio);
+ /* Metadata reads are checked and repaired by the submitter. */
+ if (bbio->bio.bi_opf & REQ_META)
+ bbio->end_io(bbio);
+ else
+ btrfs_check_read_bio(bbio, bbio->bio.bi_private);
}
static void btrfs_simple_end_io(struct bio *bio)
{
- struct btrfs_fs_info *fs_info = bio->bi_private;
struct btrfs_bio *bbio = btrfs_bio(bio);
+ struct btrfs_device *dev = bio->bi_private;
+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
btrfs_bio_counter_dec(fs_info);
if (bio->bi_status)
- btrfs_log_dev_io_error(bio, bbio->device);
+ btrfs_log_dev_io_error(bio, dev);
if (bio_op(bio) == REQ_OP_READ) {
INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
} else {
- bbio->end_io(bbio);
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+ btrfs_record_physical_zoned(bbio);
+ btrfs_orig_bbio_end_io(bbio);
}
}
@@ -118,7 +340,10 @@ static void btrfs_raid56_end_io(struct bio *bio)
btrfs_bio_counter_dec(bioc->fs_info);
bbio->mirror_num = bioc->mirror_num;
- bbio->end_io(bbio);
+ if (bio_op(bio) == REQ_OP_READ && !(bbio->bio.bi_opf & REQ_META))
+ btrfs_check_read_bio(bbio, NULL);
+ else
+ btrfs_orig_bbio_end_io(bbio);
btrfs_put_bioc(bioc);
}
@@ -145,7 +370,7 @@ static void btrfs_orig_write_end_io(struct bio *bio)
else
bio->bi_status = BLK_STS_OK;
- bbio->end_io(bbio);
+ btrfs_orig_bbio_end_io(bbio);
btrfs_put_bioc(bioc);
}
@@ -181,16 +406,10 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
*/
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 zone_start = round_down(physical, dev->fs_info->zone_size);
- if (btrfs_dev_is_sequential(dev, physical)) {
- u64 zone_start = round_down(physical,
- dev->fs_info->zone_size);
-
- bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
- } else {
- bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
- bio->bi_opf |= REQ_OP_WRITE;
- }
+ ASSERT(btrfs_dev_is_sequential(dev, physical));
+ bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
}
btrfs_debug_in_rcu(dev->fs_info,
"%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
@@ -224,41 +443,21 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}
-void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num)
+static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
+ struct btrfs_io_stripe *smap, int mirror_num)
{
- u64 logical = bio->bi_iter.bi_sector << 9;
- u64 length = bio->bi_iter.bi_size;
- u64 map_length = length;
- struct btrfs_io_context *bioc = NULL;
- struct btrfs_io_stripe smap;
- int ret;
-
- btrfs_bio_counter_inc_blocked(fs_info);
- ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
- &bioc, &smap, &mirror_num, 1);
- if (ret) {
- btrfs_bio_counter_dec(fs_info);
- btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
- return;
- }
-
- if (map_length < length) {
- btrfs_crit(fs_info,
- "mapping failed logical %llu bio len %llu len %llu",
- logical, length, map_length);
- BUG();
- }
+ /* Do not leak our private flag into the block layer. */
+ bio->bi_opf &= ~REQ_BTRFS_ONE_ORDERED;
if (!bioc) {
- /* Single mirror read/write fast path */
+ /* Single mirror read/write fast path. */
btrfs_bio(bio)->mirror_num = mirror_num;
- btrfs_bio(bio)->device = smap.dev;
- bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
- bio->bi_private = fs_info;
+ bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
+ bio->bi_private = smap->dev;
bio->bi_end_io = btrfs_simple_end_io;
- btrfs_submit_dev_bio(smap.dev, bio);
+ btrfs_submit_dev_bio(smap->dev, bio);
} else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- /* Parity RAID write or read recovery */
+ /* Parity RAID write or read recovery. */
bio->bi_private = bioc;
bio->bi_end_io = btrfs_raid56_end_io;
if (bio_op(bio) == REQ_OP_READ)
@@ -266,16 +465,233 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror
else
raid56_parity_write(bio, bioc);
} else {
- /* Write to multiple mirrors */
+ /* Write to multiple mirrors. */
int total_devs = bioc->num_stripes;
- int dev_nr;
bioc->orig_bio = bio;
- for (dev_nr = 0; dev_nr < total_devs; dev_nr++)
+ for (int dev_nr = 0; dev_nr < total_devs; dev_nr++)
btrfs_submit_mirrored_bio(bioc, dev_nr);
}
}
+static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio)
+{
+ if (bbio->bio.bi_opf & REQ_META)
+ return btree_csum_one_bio(bbio);
+ return btrfs_csum_one_bio(bbio);
+}
+
+/*
+ * Async submit bios are used to offload expensive checksumming onto the worker
+ * threads.
+ */
+struct async_submit_bio {
+ struct btrfs_bio *bbio;
+ struct btrfs_io_context *bioc;
+ struct btrfs_io_stripe smap;
+ int mirror_num;
+ struct btrfs_work work;
+};
+
+/*
+ * In order to insert checksums into the metadata in large chunks, we wait
+ * until bio submission time. All the pages in the bio are checksummed and
+ * sums are attached onto the ordered extent record.
+ *
+ * At IO completion time the csums attached on the ordered extent record are
+ * inserted into the btree.
+ */
+static void run_one_async_start(struct btrfs_work *work)
+{
+ struct async_submit_bio *async =
+ container_of(work, struct async_submit_bio, work);
+ blk_status_t ret;
+
+ ret = btrfs_bio_csum(async->bbio);
+ if (ret)
+ async->bbio->bio.bi_status = ret;
+}
+
+/*
+ * Second stage of an async submission: hand the bio to the lower layers.
+ *
+ * If the bio carries an error status (run_one_async_start() stores a
+ * checksumming failure there), it is completed via btrfs_orig_bbio_end_io()
+ * instead of being submitted. Otherwise it is passed to __btrfs_submit_bio()
+ * with the bioc, stripe and mirror number saved in the async_submit_bio.
+ */
+static void run_one_async_done(struct btrfs_work *work)
+{
+ struct async_submit_bio *async =
+ container_of(work, struct async_submit_bio, work);
+ struct bio *bio = &async->bbio->bio;
+
+ /* If an error occurred we just want to clean up the bio and move on. */
+ if (bio->bi_status) {
+ btrfs_orig_bbio_end_io(async->bbio);
+ return;
+ }
+
+ /*
+ * All of the bios that pass through here are from async helpers.
+ * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
+ * This changes nothing when cgroups aren't in use.
+ */
+ bio->bi_opf |= REQ_CGROUP_PUNT;
+ __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
+}
+
+static void run_one_async_free(struct btrfs_work *work)
+{
+ kfree(container_of(work, struct async_submit_bio, work));
+}
+
+static bool should_async_write(struct btrfs_bio *bbio)
+{
+ /*
+ * If the I/O is not issued by fsync and friends, (->sync_writers != 0),
+ * then try