summaryrefslogtreecommitdiff
path: root/fs/btrfs/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--fs/btrfs/inode.c829
1 files changed, 436 insertions, 393 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 81737eff92f3..f0c97d25b4a0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -114,21 +114,17 @@ struct kmem_cache *btrfs_free_space_bitmap_cachep;
static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
-static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct btrfs_inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
- unsigned long *nr_written, int unlock);
+ unsigned long *nr_written, int unlock,
+ u64 *done_offset);
static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
u64 len, u64 orig_start, u64 block_start,
u64 block_len, u64 orig_block_len,
u64 ram_bytes, int compress_type,
int type);
-static void __endio_write_update_ordered(struct btrfs_inode *inode,
- const u64 offset, const u64 bytes,
- const bool uptodate);
-
/*
* btrfs_inode_lock - lock inode i_rwsem based on arguments passed
*
@@ -195,11 +191,14 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
{
unsigned long index = offset >> PAGE_SHIFT;
unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
- u64 page_start = page_offset(locked_page);
- u64 page_end = page_start + PAGE_SIZE - 1;
-
+ u64 page_start, page_end;
struct page *page;
+ if (locked_page) {
+ page_start = page_offset(locked_page);
+ page_end = page_start + PAGE_SIZE - 1;
+ }
+
while (index <= end_index) {
/*
* For locked page, we will call end_extent_writepage() on it
@@ -212,7 +211,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
* btrfs_mark_ordered_io_finished() would skip the accounting
* for the page range, and the ordered extent will never finish.
*/
- if (index == (page_offset(locked_page) >> PAGE_SHIFT)) {
+ if (locked_page && index == (page_start >> PAGE_SHIFT)) {
index++;
continue;
}
@@ -223,7 +222,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
/*
* Here we just clear all Ordered bits for every page in the
- * range, then __endio_write_update_ordered() will handle
+ * range, then btrfs_mark_ordered_io_finished() will handle
* the ordered extent accounting for the range.
*/
btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
@@ -231,20 +230,23 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
put_page(page);
}
- /* The locked page covers the full range, nothing needs to be done */
- if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE)
- return;
- /*
- * In case this page belongs to the delalloc range being instantiated
- * then skip it, since the first page of a range is going to be
- * properly cleaned up by the caller of run_delalloc_range
- */
- if (page_start >= offset && page_end <= (offset + bytes - 1)) {
- bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
- offset = page_offset(locked_page) + PAGE_SIZE;
+ if (locked_page) {
+ /* The locked page covers the full range, nothing needs to be done */
+ if (bytes + offset <= page_start + PAGE_SIZE)
+ return;
+ /*
+ * In case this page belongs to the delalloc range being
+ * instantiated then skip it, since the first page of a range is
+ * going to be properly cleaned up by the caller of
+ * run_delalloc_range
+ */
+ if (page_start >= offset && page_end <= (offset + bytes - 1)) {
+ bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
+ offset = page_offset(locked_page) + PAGE_SIZE;
+ }
}
- return __endio_write_update_ordered(inode, offset, bytes, false);
+ return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
}
static int btrfs_dirty_inode(struct inode *inode);
@@ -332,9 +334,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
cur_size = min_t(unsigned long, compressed_size,
PAGE_SIZE);
- kaddr = kmap_atomic(cpage);
+ kaddr = kmap_local_page(cpage);
write_extent_buffer(leaf, kaddr, ptr, cur_size);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
i++;
ptr += cur_size;
@@ -345,9 +347,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
} else {
page = find_get_page(inode->vfs_inode.i_mapping, 0);
btrfs_set_file_extent_compression(leaf, ei, 0);
- kaddr = kmap_atomic(page);
+ kaddr = kmap_local_page(page);
write_extent_buffer(leaf, kaddr, ptr, size);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
put_page(page);
}
btrfs_mark_buffer_dirty(leaf);
@@ -485,7 +487,7 @@ struct async_chunk {
struct page *locked_page;
u64 start;
u64 end;
- unsigned int write_flags;
+ blk_opf_t write_flags;
struct list_head extents;
struct cgroup_subsys_state *blkcg_css;
struct btrfs_work work;
@@ -560,8 +562,8 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
* will unlock the full page.
*/
if (fs_info->sectorsize < PAGE_SIZE) {
- if (!IS_ALIGNED(start, PAGE_SIZE) ||
- !IS_ALIGNED(end + 1, PAGE_SIZE))
+ if (!PAGE_ALIGNED(start) ||
+ !PAGE_ALIGNED(end + 1))
return 0;
}
@@ -678,8 +680,8 @@ again:
* Thus we must also check against @actual_end, not just @end.
*/
if (blocksize < PAGE_SIZE) {
- if (!IS_ALIGNED(start, PAGE_SIZE) ||
- !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE))
+ if (!PAGE_ALIGNED(start) ||
+ !PAGE_ALIGNED(round_up(actual_end, blocksize)))
goto cleanup_and_bail_uncompressed;
}
@@ -920,15 +922,25 @@ static int submit_uncompressed_range(struct btrfs_inode *inode,
* can directly submit them without interruption.
*/
ret = cow_file_range(inode, locked_page, start, end, &page_started,
- &nr_written, 0);
+ &nr_written, 0, NULL);
/* Inline extent inserted, page gets unlocked and everything is done */
if (page_started) {
ret = 0;
goto out;
}
if (ret < 0) {
- if (locked_page)
+ btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
+ if (locked_page) {
+ const u64 page_start = page_offset(locked_page);
+ const u64 page_end = page_start + PAGE_SIZE - 1;
+
+ btrfs_page_set_error(inode->root->fs_info, locked_page,
+ page_start, PAGE_SIZE);
+ set_page_writeback(locked_page);
+ end_page_writeback(locked_page);
+ end_extent_writepage(locked_page, ret, page_start, page_end);
unlock_page(locked_page);
+ }
goto out;
}
@@ -1133,15 +1145,39 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* *page_started is set to one if we unlock locked_page and do everything
* required to start IO on it. It may be clean and already done with
* IO when we return.
+ *
+ * When unlock == 1, we unlock the pages in successfully allocated regions.
+ * When unlock == 0, we leave them locked for writing them out.
+ *
+ * However, we unlock all the pages except @locked_page in case of failure.
+ *
+ * In summary, page locking state will be as follow:
+ *
+ * - page_started == 1 (return value)
+ * - All the pages are unlocked. IO is started.
+ * - Note that this can happen only on success
+ * - unlock == 1
+ * - All the pages except @locked_page are unlocked in any case
+ * - unlock == 0
+ * - On success, all the pages are locked for writing out them
+ * - On failure, all the pages except @locked_page are unlocked
+ *
+ * When a failure happens in the second or later iteration of the
+ * while-loop, the ordered extents created in previous iterations are kept
+ * intact. So, the caller must clean them up by calling
+ * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
+ * example.
*/
static noinline int cow_file_range(struct btrfs_inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
- unsigned long *nr_written, int unlock)
+ unsigned long *nr_written, int unlock,
+ u64 *done_offset)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 alloc_hint = 0;
+ u64 orig_start = start;
u64 num_bytes;
unsigned long ram_size;
u64 cur_alloc_size = 0;
@@ -1329,18 +1365,62 @@ out_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
+ /*
+ * If done_offset is non-NULL and ret == -EAGAIN, we expect the
+ * caller to write out the successfully allocated region and retry.
+ */
+ if (done_offset && ret == -EAGAIN) {
+ if (orig_start < start)
+ *done_offset = start - 1;
+ else
+ *done_offset = start;
+ return ret;
+ } else if (ret == -EAGAIN) {
+ /* Convert to -ENOSPC since the caller cannot retry. */
+ ret = -ENOSPC;
+ }
+
+ /*
+ * Now, we have three regions to clean up:
+ *
+ * |-------(1)----|---(2)---|-------------(3)----------|
+ * `- orig_start `- start `- start + cur_alloc_size `- end
+ *
+ * We process each region below.
+ */
+
clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
+
+ /*
+ * For the range (1). We have already instantiated the ordered extents
+ * for this region. They are cleaned up by
+ * btrfs_cleanup_ordered_extents() in e.g,
+ * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
+ * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
+ * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
+ * function.
+ *
+ * However, in case of unlock == 0, we still need to unlock the pages
+ * (except @locked_page) to ensure all the pages are unlocked.
+ */
+ if (!unlock && orig_start < start) {
+ if (!locked_page)
+ mapping_set_error(inode->vfs_inode.i_mapping, ret);
+ extent_clear_unlock_delalloc(inode, orig_start, start - 1,
+ locked_page, 0, page_ops);
+ }
+
/*
- * If we reserved an extent for our delalloc range (or a subrange) and
- * failed to create the respective ordered extent, then it means that
- * when we reserved the extent we decremented the extent's size from
- * the data space_info's bytes_may_use counter and incremented the
- * space_info's bytes_reserved counter by the same amount. We must make
- * sure extent_clear_unlock_delalloc() does not try to decrement again
- * the data space_info's bytes_may_use counter, therefore we do not pass
- * it the flag EXTENT_CLEAR_DATA_RESV.
+ * For the range (2). If we reserved an extent for our delalloc range
+ * (or a subrange) and failed to create the respective ordered extent,
+ * then it means that when we reserved the extent we decremented the
+ * extent's size from the data space_info's bytes_may_use counter and
+ * incremented the space_info's bytes_reserved counter by the same
+ * amount. We must make sure extent_clear_unlock_delalloc() does not try
+ * to decrement again the data space_info's bytes_may_use counter,
+ * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
*/
if (extent_reserved) {
extent_clear_unlock_delalloc(inode, start,
@@ -1350,12 +1430,19 @@ out_unlock:
page_ops);
start += cur_alloc_size;
if (start >= end)
- goto out;
+ return ret;
}
+
+ /*
+ * For the range (3). We never touched the region. In addition to the
+ * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
+ * space_info's bytes_may_use counter, reserved in
+ * btrfs_check_data_free_space().
+ */
extent_clear_unlock_delalloc(inode, start, end, locked_page,
clear_bits | EXTENT_CLEAR_DATA_RESV,
page_ops);
- goto out;
+ return ret;
}
/*
@@ -1435,7 +1522,7 @@ static int cow_file_range_async(struct btrfs_inode *inode,
int i;
bool should_compress;
unsigned nofs_flag;
- const unsigned int write_flags = wbc_to_write_flags(wbc);
+ const blk_opf_t write_flags = wbc_to_write_flags(wbc);
unlock_extent(&inode->io_tree, start, end);
@@ -1538,19 +1625,42 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
u64 end, int *page_started,
unsigned long *nr_written)
{
+ u64 done_offset = end;
int ret;
+ bool locked_page_done = false;
- ret = cow_file_range(inode, locked_page, start, end, page_started,
- nr_written, 0);
- if (ret)
- return ret;
+ while (start <= end) {
+ ret = cow_file_range(inode, locked_page, start, end, page_started,
+ nr_written, 0, &done_offset);
+ if (ret && ret != -EAGAIN)
+ return ret;
- if (*page_started)
- return 0;
+ if (*page_started) {
+ ASSERT(ret == 0);
+ return 0;
+ }
+
+ if (ret == 0)
+ done_offset = end;
+
+ if (done_offset == start) {
+ struct btrfs_fs_info *info = inode->root->fs_info;
+
+ wait_var_event(&info->zone_finish_wait,
+ !test_bit(BTRFS_FS_NEED_ZONE_FINISH, &info->flags));
+ continue;
+ }
+
+ if (!locked_page_done) {
+ __set_page_dirty_nobuffers(locked_page);
+ account_page_redirty(locked_page);
+ }
+ locked_page_done = true;
+ extent_write_locked_range(&inode->vfs_inode, start, done_offset);
+
+ start = done_offset + 1;
+ }
- __set_page_dirty_nobuffers(locked_page);
- account_page_redirty(locked_page);
- extent_write_locked_range(&inode->vfs_inode, start, end);
*page_started = 1;
return 0;
@@ -1642,7 +1752,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
}
return cow_file_range(inode, locked_page, start, end, page_started,
- nr_written, 1);
+ nr_written, 1, NULL);
}
struct can_nocow_file_extent_args {
@@ -2115,7 +2225,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
page_started, nr_written);
else
ret = cow_file_range(inode, locked_page, start, end,
- page_started, nr_written, 1);
+ page_started, nr_written, 1, NULL);
} else {
set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
ret = cow_file_range_async(inode, wbc, locked_page, start, end,
@@ -2131,6 +2241,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
void btrfs_split_delalloc_extent(struct inode *inode,
struct extent_state *orig, u64 split)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 size;
/* not delalloc, ignore it */
@@ -2138,7 +2249,7 @@ void btrfs_split_delalloc_extent(struct inode *inode,
return;
size = orig->end - orig->start + 1;
- if (size > BTRFS_MAX_EXTENT_SIZE) {
+ if (size > fs_info->max_extent_size) {
u32 num_extents;
u64 new_size;
@@ -2147,10 +2258,10 @@ void btrfs_split_delalloc_extent(struct inode *inode,
* applies here, just in reverse.
*/
new_size = orig->end - split + 1;
- num_extents = count_max_extents(new_size);
+ num_extents = count_max_extents(fs_info, new_size);
new_size = split - orig->start;
- num_extents += count_max_extents(new_size);
- if (count_max_extents(size) >= num_extents)
+ num_extents += count_max_extents(fs_info, new_size);
+ if (count_max_extents(fs_info, size) >= num_extents)
return;
}
@@ -2167,6 +2278,7 @@ void btrfs_split_delalloc_extent(struct inode *inode,
void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
struct extent_state *other)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 new_size, old_size;
u32 num_extents;
@@ -2180,7 +2292,7 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
new_size = other->end - new->start + 1;
/* we're not bigger than the max, unreserve the space and go */
- if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
+ if (new_size <= fs_info->max_extent_size) {
spin_lock(&BTRFS_I(inode)->lock);
btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
spin_unlock(&BTRFS_I(inode)->lock);
@@ -2206,10 +2318,10 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
* this case.
*/
old_size = other->end - other->start + 1;
- num_extents = count_max_extents(old_size);
+ num_extents = count_max_extents(fs_info, old_size);
old_size = new->end - new->start + 1;
- num_extents += count_max_extents(old_size);
- if (count_max_extents(new_size) >= num_extents)
+ num_extents += count_max_extents(fs_info, old_size);
+ if (count_max_extents(fs_info, new_size) >= num_extents)
return;
spin_lock(&BTRFS_I(inode)->lock);
@@ -2274,21 +2386,21 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
* list of inodes that have pending delalloc work to be done.
*/
void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
- unsigned *bits)
+ u32 bits)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
+ if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
WARN_ON(1);
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
- if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
+ if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 len = state->end + 1 - state->start;
- u32 num_extents = count_max_extents(len);
+ u32 num_extents = count_max_extents(fs_info, len);
bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
spin_lock(&BTRFS_I(inode)->lock);
@@ -2303,7 +2415,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
fs_info->delalloc_batch);
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->delalloc_bytes += len;
- if (*bits & EXTENT_DEFRAG)
+ if (bits & EXTENT_DEFRAG)
BTRFS_I(inode)->defrag_bytes += len;
if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
&BTRFS_I(inode)->runtime_flags))
@@ -2312,7 +2424,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
}
if (!(state->state & EXTENT_DELALLOC_NEW) &&
- (*bits & EXTENT_DELALLOC_NEW)) {
+ (bits & EXTENT_DELALLOC_NEW)) {
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
state->start;
@@ -2325,14 +2437,14 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
* accounting happens.
*/
void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
- struct extent_state *state, unsigned *bits)
+ struct extent_state *state, u32 bits)
{
struct btrfs_inode *inode = BTRFS_I(vfs_inode);
struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
u64 len = state->end + 1 - state->start;
- u32 num_extents = count_max_extents(len);
+ u32 num_extents = count_max_extents(fs_info, len);
- if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
+ if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
spin_lock(&inode->lock);
inode->defrag_bytes -= len;
spin_unlock(&inode->lock);
@@ -2343,7 +2455,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
* but in this case, we are only testing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
- if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
+ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = inode->root;
bool do_list = !btrfs_is_free_space_inode(inode);
@@ -2356,7 +2468,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
* don't need to call delalloc_release_metadata if there is an
* error.
*/
- if (*bits & EXTENT_CLEAR_META_RESV &&
+ if (bits & EXTENT_CLEAR_META_RESV &&
root != fs_info->tree_root)
btrfs_delalloc_release_metadata(inode, len, false);
@@ -2366,7 +2478,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
if (!btrfs_is_data_reloc_root(root) &&
do_list && !(state->state & EXTENT_NORESERVE) &&
- (*bits & EXTENT_CLEAR_DATA_RESV))
+ (bits & EXTENT_CLEAR_DATA_RESV))
btrfs_free_reserved_data_space_noquota(fs_info, len);
percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
@@ -2381,11 +2493,11 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
}
if ((state->state & EXTENT_DELALLOC_NEW) &&
- (*bits & EXTENT_DELALLOC_NEW)) {
+ (bits & EXTENT_DELALLOC_NEW)) {
spin_lock(&inode->lock);
ASSERT(inode->new_delalloc_bytes >= len);
inode->new_delalloc_bytes -= len;
- if (*bits & EXTENT_ADD_INODE_BYTES)
+ if (bits & EXTENT_ADD_INODE_BYTES)
inode_add_bytes(&inode->vfs_inode, len);
spin_unlock(&inode->lock);
}
@@ -2580,95 +2692,78 @@ out:
return errno_to_blk_status(ret);
}
-/*
- * extent_io.c submission hook. This does the right thing for csum calculation
- * on write, or reading the csums from the tree before a read.
- *
- * Rules about async/sync submit,
- * a) read: sync submit
- *
- * b) write without checksum: sync submit
- *
- * c) write with checksum:
- * c-1) if bio is issued by fsync: sync submit
- * (sync_writers != 0)
- *
- * c-2) if root is reloc root: sync submit
- * (only in case of buffered IO)
- *
- * c-3) otherwise: async submit
- */
-void btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
- int mirror_num, enum btrfs_compression_type compress_type)
+void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
- blk_status_t ret = 0;
- int skip_sum;
- int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
-
- skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
- test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
-
- if (btrfs_is_free_space_inode(BTRFS_I(inode)))
- metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
+ struct btrfs_inode *bi = BTRFS_I(inode);
+ blk_status_t ret;
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- struct page *page = bio_first_bvec_all(bio)->bv_page;
- loff_t file_offset = page_offset(page);
-
- ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset);
+ ret = extract_ordered_extent(bi, bio,
+ page_offset(bio_first_bvec_all(bio)->bv_page));
if (ret)
goto out;
}
- if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
- ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
- if (ret)
- goto out;
-
- if (compress_type != BTRFS_COMPRESS_NONE) {
- /*
- * btrfs_submit_compressed_read will handle completing
- * the bio if there were any errors, so just return
- * here.
- */
- btrfs_submit_compressed_read(inode, bio, mirror_num);
+ /*
+ * If we need to checksum, and the I/O is not issued by fsync and
+ * friends, that is ->sync_writers != 0, defer the submission to a
+ * workqueue to parallelize it.
+ *
+ * Csum items for reloc roots have already been cloned at this point,
+ * so they are handled as part of the no-checksum case.
+ */
+ if (!(bi->flags & BTRFS_INODE_NODATASUM) &&
+ !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
+ !btrfs_is_data_reloc_root(bi->root)) {
+ if (!atomic_read(&bi->sync_writers) &&
+ btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
+ btrfs_submit_bio_start))
return;
- } else {
- /*
- * Lookup bio sums does extra checks around whether we
- * need to csum or not, which is why we ignore skip_sum
- * here.
- */
- ret = btrfs_lookup_bio_sums(inode, bio, NULL);
- if (ret)
- goto out;
- }
- goto mapit;
- } else if (async && !skip_sum) {
- /* csum items have already been cloned */
- if (btrfs_is_data_reloc_root(root))
- goto mapit;
- /* we're doing a write, do the async checksumming */
- ret = btrfs_wq_submit_bio(inode, bio, mirror_num,
- 0, btrfs_submit_bio_start);
- goto out;
- } else if (!skip_sum) {
- ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false);
+
+ ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false);
if (ret)
goto out;
}
+ btrfs_submit_bio(fs_info, bio, mirror_num);
+ return;
+out:
+ if (ret) {
+ bio->bi_status = ret;
+ bio_endio(bio);
+ }
+}
-mapit:
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
+void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, enum btrfs_compression_type compress_type)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ blk_status_t ret;
-out:
+ if (compress_type != BTRFS_COMPRESS_NONE) {
+ /*
+ * btrfs_submit_compressed_read will handle completing the bio
+ * if there were any errors, so just return here.
+ */
+ btrfs_submit_compressed_read(inode, bio, mirror_num);
+ return;
+ }
+
+ /* Save the original iter for read repair */
+ btrfs_bio(bio)->iter = bio->bi_iter;
+
+ /*
+ * Lookup bio sums does extra checks around whether we need to csum or
+ * not, which is why we ignore skip_sum here.
+ */
+ ret = btrfs_lookup_bio_sums(inode, bio, NULL);
if (ret) {
bio->bi_status = ret;
bio_endio(bio);
+ return;
}
+
+ btrfs_submit_bio(fs_info, bio, mirror_num);
}
/*
@@ -3075,8 +3170,10 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
oe->disk_num_bytes);
btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
- if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
- num_bytes = ram_bytes = oe->truncated_len;
+ if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) {
+ num_bytes = oe->truncated_len;
+ ram_bytes = num_bytes;
+ }
btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
@@ -3102,7 +3199,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
* an ordered extent if the range of bytes in the file it covers are
* fully written.
*/
-static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
+int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
{
struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
struct btrfs_root *root = inode->root;
@@ -3195,6 +3292,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->file_offset,
ordered_extent->file_offset +
logical_len);
+ btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes);
} else {
BUG_ON(root == fs_info->tree_root);
ret = insert_ordered_extent_file_extent(trans, ordered_extent);
@@ -3309,65 +3408,71 @@ out:
return ret;
}
-static void finish_ordered_fn(struct btrfs_work *work)
-{
- struct btrfs_ordered_extent *ordered_extent;
- ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
- btrfs_finish_ordered_io(ordered_extent);
-}
-
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
struct page *page, u64 start,
u64 end, bool uptodate)
{
trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
- btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start,
- finish_ordered_fn, uptodate);
+ btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, uptodate);
+}
+
+/*
+ * Verify the checksum for a single sector without any extra action that depend
+ * on the type of I/O.
+ */
+int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
+ u32 pgoff, u8 *csum, const u8 * const csum_expected)
+{
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+ char *kaddr;
+
+ ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);
+
+ shash->tfm = fs_info->csum_shash;
+
+ kaddr = kmap_local_page(page) + pgoff;
+ crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
+ kunmap_local(kaddr);
+
+ if (memcmp(csum, csum_expected, fs_info->csum_size))
+ return -EIO;
+ return 0;
}
/*
* check_data_csum - verify checksum of one sector of uncompressed data
* @inode: inode
- * @io_bio: btrfs_io_bio which contains the csum
+ * @bbio: btrfs_bio which contains the csum
* @bio_offset: offset to the beginning of the bio (in bytes)
* @page: page where is the data to be verified
* @pgoff: offset inside the page
- * @start: logical offset in the file
*
* The length of such check is always one sector size.
+ *
+ * When csum mismatch is detected, we will also report the error and fill the
+ * corrupted range with zero. (Thus it needs the extra parameters)
*/
-static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
- u32 bio_offset, struct page *page, u32 pgoff,
- u64 start)
+int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page, u32 pgoff)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- char *kaddr;
u32 len = fs_info->sectorsize;
- const u32 csum_size = fs_info->csum_size;
- unsigned int offset_sectors;
u8 *csum_expected;
u8 csum[BTRFS_CSUM_SIZE];
ASSERT(pgoff + len <= PAGE_SIZE);
- offset_sectors = bio_offset >> fs_info->sectorsize_bits;
- csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size;
+ csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset);
- kaddr = kmap_atomic(page);
- shash->tfm = fs_info->csum_shash;
-
- crypto_shash_digest(shash, kaddr + pgoff, len, csum);
- kunmap_atomic(kaddr);
-
- if (memcmp(csum, csum_expected, csum_size))
+ if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected))
goto zeroit;
-
return 0;
+
zeroit:
- btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
- bbio->mirror_num);
+ btrfs_print_data_csum_error(BTRFS_I(inode),
+ bbio->file_offset + bio_offset,
+ csum, csum_expected, bbio->mirror_num);
if (bbio->device)
btrfs_dev_stat_inc_and_print(bbio->device,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
@@ -3399,11 +3504,6 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
u32 pg_off;
unsigned int result = 0;
- if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) {
- btrfs_page_clear_checked(fs_info, page, start, end + 1 - start);
- return 0;
- }
-
/*
* This only happens for NODATASUM or compressed read.
* Normally this should be covered by above check for compressed read
@@ -3436,8 +3536,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
EXTENT_NODATASUM);
continue;
}
- ret = check_data_csum(inode, bbio, bio_offset, page, pg_off,
- page_offset(page) + pg_off);
+ ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off);
if (ret < 0) {
const int nr_bit = (pg_off - offset_in_page(start)) >>
root->fs_info->sectorsize_bits;
@@ -3576,7 +3675,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
u64 last_objectid = 0;
int ret = 0, nr_unlink = 0;
- /* Bail out if the cleanup is already running. */
if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
return 0;
@@ -3659,17 +3757,17 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
*
* btrfs_find_orphan_roots() ran before us, which has
* found all deleted roots and loaded them into
- * fs_info->fs_roots. So here we can find if an
+ * fs_info->fs_roots_radix. So here we can find if an
* orphan item corresponds to a deleted root by looking
- * up the root from that xarray.
+ * up the root from that radix tree.
*/
- spin_lock(&fs_info->fs_roots_lock);
- dead_root = xa_load(&fs_info->fs_roots,
- (unsigned long)found_key.objectid);
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ (unsigned long)found_key.objectid);
if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
is_dead_root = 1;
- spin_unlock(&fs_info->fs_roots_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
if (is_dead_root) {
/* prevent this orphan from being found again */
@@ -3909,7 +4007,7 @@ cache_index:
* cache.
*
* This is required for both inode re-read from disk and delayed inode
- * in the delayed_nodes xarray.
+ * in delayed_nodes_tree.
*/
if (BTRFS_I(inode)->last_trans == fs_info->generation)
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
@@ -4227,7 +4325,7 @@ skip_backref:
/*
* If we are in a rename context, we don't need to update anything in the
* log. That will be done later during the rename by btrfs_log_new_name().
- * Besides that, doing it here would only cause extra unncessary btree
+ * Besides that, doing it here would only cause extra unnecessary btree
* operations on the log tree, increasing latency for applications.
*/
if (!rename_ctx) {
@@ -4255,8 +4353,9 @@ err:
btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2);
inode_inc_iversion(&inode->vfs_inode);
inode_inc_iversion(&dir->vfs_inode);
- inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime =
- dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
+ inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
+ dir->vfs_inode.i_mtime = inode->vfs_inode.i_ctime;
+ dir->vfs_inode.i_ctime = inode->vfs_inode.i_ctime;
ret = btrfs_update_inode(trans, root, dir);
out:
return ret;
@@ -4418,7 +4517,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2);
inode_inc_iversion(dir);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = current_time(dir);
+ dir->i_ctime = dir->i_mtime;
ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir));
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -4857,7 +4957,6 @@ again:
else
memzero_page(page, (block_start - page_offset(page)) + offset,
len);
- flush_dcache_page(page);
}
btrfs_page_clear_checked(fs_info, page, block_start,
block_end + 1 - block_start);
@@ -5060,9 +5159,10 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
*/
if (newsize != oldsize) {
inode_inc_iversion(inode);
- if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
- inode->i_ctime = inode->i_mtime =
- current_time(inode);
+ if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
+ inode->i_mtime = current_time(inode);
+ inode->i_ctime = inode->i_mtime;
+ }
}
if (newsize > oldsize) {
@@ -5370,7 +5470,7 @@ void btrfs_evict_inode(struct inode *inode)
if (!rsv)
goto no_delete;
rsv->size = btrfs_calc_metadata_size(fs_info, 1);
- rsv->failfast = 1;
+ rsv->failfast = true;
btrfs_i_size_write(BTRFS_I(inode), 0);
@@ -5762,14 +5862,14 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
if (ret != -ENOENT)
inode = ERR_PTR(ret);
else
- inode = new_simple_dir(dir->i_sb, &location, sub_root);
+ inode = new_simple_dir(dir->i_sb, &location, root);
} else {
inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
- }
- if (root != sub_root)
btrfs_put_root(sub_root);
- if (!IS_ERR(inode) && root != sub_root) {
+ if (IS_ERR(inode))
+ return inode;
+
down_read(&fs_info->cleanup_work_sem);
if (!sb_rdonly(inode->i_sb))
ret = btrfs_orphan_cleanup(sub_root);
@@ -6